path: root/kernel/sched
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/autogroup.c          |    9
-rw-r--r--  kernel/sched/autogroup.h          |    6
-rw-r--r--  kernel/sched/build_policy.c       |    6
-rw-r--r--  kernel/sched/build_utility.c      |    9
-rw-r--r--  kernel/sched/clock.c              |    7
-rw-r--r--  kernel/sched/completion.c         |    5
-rw-r--r--  kernel/sched/core.c               | 1067
-rw-r--r--  kernel/sched/core_sched.c         |    2
-rw-r--r--  kernel/sched/cpuacct.c            |    2
-rw-r--r--  kernel/sched/cpudeadline.c        |    1
-rw-r--r--  kernel/sched/cpudeadline.h        |    4
-rw-r--r--  kernel/sched/cpufreq.c            |    1
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  |   64
-rw-r--r--  kernel/sched/cpupri.c             |    1
-rw-r--r--  kernel/sched/cpupri.h             |    5
-rw-r--r--  kernel/sched/cputime.c            |   17
-rw-r--r--  kernel/sched/deadline.c           |  218
-rw-r--r--  kernel/sched/debug.c              |   51
-rw-r--r--  kernel/sched/ext.c                | 2165
-rw-r--r--  kernel/sched/ext.h                |   35
-rw-r--r--  kernel/sched/ext_idle.c           |  357
-rw-r--r--  kernel/sched/ext_idle.h           |   15
-rw-r--r--  kernel/sched/fair.c               |  455
-rw-r--r--  kernel/sched/idle.c               |   15
-rw-r--r--  kernel/sched/isolation.c          |    4
-rw-r--r--  kernel/sched/loadavg.c            |    8
-rw-r--r--  kernel/sched/membarrier.c         |    2
-rw-r--r--  kernel/sched/pelt.c               |    5
-rw-r--r--  kernel/sched/pelt.h               |   67
-rw-r--r--  kernel/sched/psi.c                |  135
-rw-r--r--  kernel/sched/rt.c                 |  217
-rw-r--r--  kernel/sched/sched-pelt.h         |    1
-rw-r--r--  kernel/sched/sched.h              |  282
-rw-r--r--  kernel/sched/smp.h                |    7
-rw-r--r--  kernel/sched/stats.c              |    5
-rw-r--r--  kernel/sched/stats.h              |   10
-rw-r--r--  kernel/sched/stop_task.c          |    5
-rw-r--r--  kernel/sched/swait.c              |    1
-rw-r--r--  kernel/sched/syscalls.c           |   20
-rw-r--r--  kernel/sched/topology.c           |  199
-rw-r--r--  kernel/sched/wait.c               |   23
-rw-r--r--  kernel/sched/wait_bit.c           |    3
42 files changed, 2946 insertions, 2565 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2b331822c7e7..cdea931aae30 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -4,6 +4,9 @@
* Auto-group scheduling implementation:
*/
+#include "autogroup.h"
+#include "sched.h"
+
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
@@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void)
{
register_sysctl_init("kernel", sched_autogroup_sysctls);
}
-#else
+#else /* !CONFIG_SYSCTL: */
#define sched_autogroup_sysctl_init() do { } while (0)
-#endif
+#endif /* !CONFIG_SYSCTL */
void __init autogroup_init(struct task_struct *init_task)
{
@@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void)
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
tg->rt_rq = root_task_group.rt_rq;
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
tg->autogroup = ag;
sched_online_group(tg, &root_task_group);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 90d69f2c5eaf..06c82b2bdfb5 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,8 @@
#ifndef _KERNEL_SCHED_AUTOGROUP_H
#define _KERNEL_SCHED_AUTOGROUP_H
+#include "sched.h"
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
-#else /* !CONFIG_SCHED_AUTOGROUP */
+#else /* !CONFIG_SCHED_AUTOGROUP: */
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
@@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
return 0;
}
-#endif /* CONFIG_SCHED_AUTOGROUP */
+#endif /* !CONFIG_SCHED_AUTOGROUP */
#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 72d97aa8b726..c4a488e67aa7 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -50,11 +50,9 @@
#include "idle.c"
#include "rt.c"
+#include "cpudeadline.c"
-#ifdef CONFIG_SMP
-# include "cpudeadline.c"
-# include "pelt.c"
-#endif
+#include "pelt.c"
#include "cputime.c"
#include "deadline.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index bf9d8db94b70..e2cf3b08d4e9 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -80,11 +80,10 @@
#include "wait_bit.c"
#include "wait.c"
-#ifdef CONFIG_SMP
-# include "cpupri.c"
-# include "stop_task.c"
-# include "topology.c"
-#endif
+#include "cpupri.c"
+#include "stop_task.c"
+
+#include "topology.c"
#ifdef CONFIG_SCHED_CORE
# include "core_sched.c"
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a09655b48140..f5e6dd6a6b3a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -54,6 +54,9 @@
*
*/
+#include <linux/sched/clock.h>
+#include "sched.h"
+
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
@@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void)
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */
void __init sched_clock_init(void)
{
@@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
* Running clock - returns the time that has elapsed while a guest has been
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 3561ab533dd4..19ee702273c0 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -13,6 +13,11 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
+#include <linux/linkage.h>
+#include <linux/sched/debug.h>
+#include <linux/completion.h>
+#include "sched.h"
+
static void complete_with_flags(struct completion *x, int wake_flags)
{
unsigned long flags;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfaca3040b2f..be00629f0ba4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -66,10 +66,11 @@
#include <linux/vtime.h>
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
+#include <linux/livepatch_sched.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
-# ifdef CONFIG_GENERIC_ENTRY
-# include <linux/entry-common.h>
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
# endif
#endif
@@ -95,6 +96,7 @@
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+#include "../locking/mutex.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
@@ -118,6 +120,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static int __init setup_proxy_exec(char *str)
+{
+ bool proxy_enable = true;
+
+ if (*str && kstrtobool(str + 1, &proxy_enable)) {
+ pr_warn("Unable to parse sched_proxy_exec=\n");
+ return 0;
+ }
+
+ if (proxy_enable) {
+ pr_info("sched_proxy_exec enabled via boot arg\n");
+ static_branch_enable(&__sched_proxy_exec);
+ } else {
+ pr_info("sched_proxy_exec disabled via boot arg\n");
+ static_branch_disable(&__sched_proxy_exec);
+ }
+ return 1;
+}
+#else
+static int __init setup_proxy_exec(char *str)
+{
+ pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
+ return 0;
+}
+#endif
+__setup("sched_proxy_exec", setup_proxy_exec);
+
/*
* Debugging: various feature bits
*
@@ -480,13 +511,13 @@ void sched_core_put(void)
schedule_work(&_work);
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
@@ -649,7 +680,6 @@ void raw_spin_rq_unlock(struct rq *rq)
raw_spin_unlock(rq_lockp(rq));
}
-#ifdef CONFIG_SMP
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -666,7 +696,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
double_rq_clock_clear_update(rq1, rq2);
}
-#endif
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -852,8 +881,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-#ifdef CONFIG_SMP
-
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
@@ -898,33 +925,12 @@ void hrtick_start(struct rq *rq, u64 delay)
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and IRQs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- delay = max_t(u64, delay, 10000LL);
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED_HARD);
-}
-
-#endif /* CONFIG_SMP */
-
static void hrtick_rq_init(struct rq *rq)
{
-#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
-#endif
hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
-#else /* CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void hrtick_clear(struct rq *rq)
{
}
@@ -932,7 +938,7 @@ static inline void hrtick_clear(struct rq *rq)
static inline void hrtick_rq_init(struct rq *rq)
{
}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
/*
* try_cmpxchg based fetch_or() macro so it works for different integer types:
@@ -948,7 +954,7 @@ static inline void hrtick_rq_init(struct rq *rq)
_val; \
})
-#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -987,13 +993,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
return true;
}
-#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
-#endif
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
@@ -1109,6 +1113,7 @@ static void __resched_curr(struct rq *rq, int tif)
cpu = cpu_of(rq);
+ trace_sched_set_need_resched_tp(curr, cpu, tif);
if (cpu == smp_processor_id()) {
set_ti_thread_flag(cti, tif);
if (tif == TIF_NEED_RESCHED)
@@ -1124,6 +1129,11 @@ static void __resched_curr(struct rq *rq, int tif)
}
}
+void __trace_set_need_resched(struct task_struct *curr, int tif)
+{
+ trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+}
+
void resched_curr(struct rq *rq)
{
__resched_curr(rq, TIF_NEED_RESCHED);
@@ -1166,7 +1176,6 @@ void resched_cpu(int cpu)
raw_spin_rq_unlock_irqrestore(rq, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy CPU for migrating timers
@@ -1373,10 +1382,8 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-#endif /* CONFIG_SMP */
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
- (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
@@ -1752,7 +1759,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
}
}
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags)
{
enum uclamp_id clamp_id;
@@ -1768,7 +1775,8 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
if (unlikely(!p->sched_class->uclamp_enabled))
return;
- if (p->se.sched_delayed)
+ /* Only inc the delayed task which is being woken up. */
+ if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
return;
for_each_clamp_id(clamp_id)
@@ -1969,7 +1977,7 @@ undo:
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
-#endif
+#endif /* CONFIG_SYSCTL */
static void uclamp_fork(struct task_struct *p)
{
@@ -2035,13 +2043,13 @@ static void __init init_uclamp(void)
}
}
-#else /* !CONFIG_UCLAMP_TASK */
-static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+#else /* !CONFIG_UCLAMP_TASK: */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
-#endif /* CONFIG_UCLAMP_TASK */
+#endif /* !CONFIG_UCLAMP_TASK */
bool sched_task_on_rq(struct task_struct *p)
{
@@ -2072,12 +2080,14 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
- p->sched_class->enqueue_task(rq, p, flags);
/*
- * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
- * ->sched_delayed.
+ * Can be before ->enqueue_task() because uclamp considers the
+ * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
+ * in ->enqueue_task().
*/
- uclamp_rq_inc(rq, p);
+ uclamp_rq_inc(rq, p, flags);
+
+ p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2283,6 +2293,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
* just go back and repeat.
*/
rq = task_rq_lock(p, &rf);
+ /*
+ * If task is sched_delayed, force dequeue it, to avoid always
+ * hitting the tick timeout in the queued case
+ */
+ if (p->se.sched_delayed)
+ dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
trace_sched_wait_task(p);
running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
@@ -2343,8 +2359,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
return ncsw;
}
-#ifdef CONFIG_SMP
-
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
@@ -2926,8 +2940,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ /*
+ * Can the task run on the task's current CPU? If so, we're done
+ *
+ * We are also done if the task is the current donor, boosting a lock-
+ * holding proxy, (and potentially has been migrated outside its
+ * current or previous affinity mask)
+ */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) ||
+ (task_current_donor(rq, p) && !task_current(rq, p))) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
@@ -3295,6 +3316,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
WARN_ON_ONCE(ret);
}
+#ifdef CONFIG_SMP
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
unsigned int state = READ_ONCE(p->__state);
@@ -3348,6 +3371,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
@@ -3647,17 +3671,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else /* CONFIG_SMP */
-
-static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
- return false;
-}
-
-#endif /* !CONFIG_SMP */
-
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3668,7 +3681,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
rq = this_rq();
-#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->stats.nr_wakeups_local);
@@ -3688,7 +3700,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->stats.nr_wakeups_migrate);
-#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->stats.nr_wakeups);
@@ -3717,13 +3728,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
-#ifdef CONFIG_SMP
if (wake_flags & WF_RQ_SELECTED)
en_flags |= ENQUEUE_RQ_SELECTED;
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
-#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
@@ -3734,7 +3743,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
ttwu_do_wakeup(p);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Our task @p is fully woken up and running; so it's safe to
@@ -3756,7 +3764,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
-#endif
}
/*
@@ -3810,7 +3817,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
return ret;
}
-#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
@@ -3877,7 +3883,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
+#ifdef CONFIG_SMP
__smp_call_single_queue(cpu, &p->wake_entry.llist);
+#endif
}
void wake_up_if_idle(int cpu)
@@ -3929,6 +3937,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
@@ -3978,15 +3991,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
-#else /* !CONFIG_SMP */
-
-static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
@@ -4242,7 +4246,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
-#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -4321,9 +4324,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-#else
- cpu = task_cpu(p);
-#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
}
@@ -4356,14 +4356,12 @@ static bool __task_needs_rq_lock(struct task_struct *p)
if (p->on_rq)
return true;
-#ifdef CONFIG_SMP
/*
* Ensure the task has finished __schedule() and will not be referenced
* anymore. Again, see try_to_wake_up() for a longer comment.
*/
smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
-#endif
return false;
}
@@ -4519,10 +4517,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
-#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
-#endif
init_sched_mm_cid(p);
}
@@ -4585,8 +4581,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write,
}
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SCHEDSTATS
@@ -4773,14 +4769,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP)
p->on_cpu = 0;
-#endif
init_task_preempt_count(p);
-#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-#endif
+
return 0;
}
@@ -4857,7 +4850,6 @@ void wake_up_new_task(struct task_struct *p)
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
-#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_ptr can change in the fork path
@@ -4869,7 +4861,6 @@ void wake_up_new_task(struct task_struct *p)
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
-#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
@@ -4877,7 +4868,6 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, wake_flags);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so it's fine to
@@ -4887,7 +4877,6 @@ void wake_up_new_task(struct task_struct *p)
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
-#endif
task_rq_unlock(rq, p, &rf);
}
@@ -4964,7 +4953,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
__fire_sched_out_preempt_notifiers(curr, next);
}
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
+#else /* !CONFIG_PREEMPT_NOTIFIERS: */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
@@ -4976,11 +4965,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
{
}
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
static inline void prepare_task(struct task_struct *next)
{
-#ifdef CONFIG_SMP
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
@@ -4989,12 +4977,10 @@ static inline void prepare_task(struct task_struct *next)
* its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
-#endif
}
static inline void finish_task(struct task_struct *prev)
{
-#ifdef CONFIG_SMP
/*
* This must be the very last reference to @prev from this CPU. After
* p->on_cpu is cleared, the task can be moved to a different CPU. We
@@ -5007,11 +4993,8 @@ static inline void finish_task(struct task_struct *prev)
* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
-#endif
}
-#ifdef CONFIG_SMP
-
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
@@ -5093,14 +5076,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head)
}
}
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
-#endif
-
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -5310,7 +5285,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* switched the context for the first time. It is returning from
* schedule for the first time in this path.
*/
- trace_sched_exit_tp(true, CALLER_ADDR0);
+ trace_sched_exit_tp(true);
preempt_enable();
if (current->set_child_tid)
@@ -5488,8 +5463,6 @@ unsigned int nr_iowait(void)
return sum;
}
-#ifdef CONFIG_SMP
-
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
@@ -5513,8 +5486,6 @@ void sched_exec(void)
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
-#endif
-
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
@@ -5549,7 +5520,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+#ifdef CONFIG_64BIT
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
@@ -5676,12 +5647,10 @@ void sched_tick(void)
if (donor->flags & PF_WQ_WORKER)
wq_worker_tick(donor);
-#ifdef CONFIG_SMP
if (!scx_switched_all()) {
rq->idle_balance = idle_cpu(cpu);
sched_balance_trigger(rq);
}
-#endif
}
#ifdef CONFIG_NO_HZ_FULL
@@ -5821,10 +5790,10 @@ int __init sched_tick_offload_init(void)
return 0;
}
-#else /* !CONFIG_NO_HZ_FULL */
+#else /* !CONFIG_NO_HZ_FULL: */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
@@ -6539,7 +6508,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu)
rq->core = rq;
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
@@ -6551,7 +6520,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return __pick_next_task(rq, prev, rf);
}
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/*
* Constants for the sched_mode argument of __schedule().
@@ -6567,19 +6536,33 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Helper function for __schedule()
*
- * If a task does not have signals pending, deactivate it
- * Otherwise marks the task's __state as RUNNING
+ * Tries to deactivate the task, unless the should_block arg
+ * is false or if a signal is pending. In the case a signal
+ * is pending, marks the task's __state as RUNNING (and clear
+ * blocked_on).
*/
static bool try_to_block_task(struct rq *rq, struct task_struct *p,
- unsigned long task_state)
+ unsigned long *task_state_p, bool should_block)
{
+ unsigned long task_state = *task_state_p;
int flags = DEQUEUE_NOCLOCK;
if (signal_pending_state(task_state, p)) {
WRITE_ONCE(p->__state, TASK_RUNNING);
+ *task_state_p = TASK_RUNNING;
return false;
}
+ /*
+ * We check should_block after signal_pending because we
+ * will want to wake the task in that case. But if
+ * should_block is false, it's likely due to the task being
+ * blocked on a mutex, and we want to keep it on the runqueue
+ * to be selectable for proxy-execution.
+ */
+ if (!should_block)
+ return false;
+
p->sched_contributes_to_load =
(task_state & TASK_UNINTERRUPTIBLE) &&
!(task_state & TASK_NOLOAD) &&
@@ -6603,6 +6586,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return true;
}
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline struct task_struct *proxy_resched_idle(struct rq *rq)
+{
+ put_prev_set_next_task(rq, rq->donor, rq->idle);
+ rq_set_donor(rq, rq->idle);
+ set_tsk_need_resched(rq->idle);
+ return rq->idle;
+}
+
+static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ unsigned long state = READ_ONCE(donor->__state);
+
+ /* Don't deactivate if the state has been changed to TASK_RUNNING */
+ if (state == TASK_RUNNING)
+ return false;
+ /*
+ * Because we got donor from pick_next_task(), it is *crucial*
+ * that we call proxy_resched_idle() before we deactivate it.
+ * As once we deactivate donor, donor->on_rq is set to zero,
+ * which allows ttwu() to immediately try to wake the task on
+ * another rq. So we cannot use *any* references to donor
+ * after that point. So things like cfs_rq->curr or rq->donor
+ * need to be changed from next *before* we deactivate.
+ */
+ proxy_resched_idle(rq);
+ return try_to_block_task(rq, donor, &state, true);
+}
+
+static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ if (!__proxy_deactivate(rq, donor)) {
+ /*
+ * XXX: For now, if deactivation failed, set donor
+ * as unblocked, as we aren't doing proxy-migrations
+ * yet (more logic will be needed then).
+ */
+ donor->blocked_on = NULL;
+ }
+ return NULL;
+}
+
+/*
+ * Find runnable lock owner to proxy for mutex blocked donor
+ *
+ * Follow the blocked-on relation:
+ * task->blocked_on -> mutex->owner -> task...
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * mutex->wait_lock
+ *
+ * Returns the task that is going to be used as execution context (the one
+ * that is actually going to be run on cpu_of(rq)).
+ */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ struct task_struct *owner = NULL;
+ int this_cpu = cpu_of(rq);
+ struct task_struct *p;
+ struct mutex *mutex;
+
+ /* Follow blocked_on chain. */
+ for (p = donor; task_is_blocked(p); p = owner) {
+ mutex = p->blocked_on;
+ /* Something changed in the chain, so pick again */
+ if (!mutex)
+ return NULL;
+ /*
+ * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
+ * and ensure @owner sticks around.
+ */
+ guard(raw_spinlock)(&mutex->wait_lock);
+
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain and
+ * we don't know if only at this level. So, let's
+ * just bail out completely and let __schedule()
+ * figure things out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+
+ if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
+ /* XXX Don't handle blocked owners/delayed dequeue yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_cpu(owner) != this_cpu) {
+ /* XXX Don't handle migrations yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_on_rq_migrating(owner)) {
+ /*
+ * One of the chain of mutex owners is currently migrating to this
+ * CPU, but has not yet been enqueued because we are holding the
+ * rq lock. As a simple solution, just schedule rq->idle to give
+ * the migration a chance to complete. Much like the migrate_task
+ * case we should end up back in find_proxy_task(), this time
+ * hopefully with all relevant tasks already enqueued.
+ */
+ return proxy_resched_idle(rq);
+ }
+
+ /*
+ * It's possible to race where after we check owner->on_rq
+ * but before we check (owner_cpu != this_cpu) that the
+ * task on another cpu was migrated back to this cpu. In
+ * that case it could slip by our checks. So double check
+ * we are still on this cpu and not migrating. If we get
+ * inconsistent results, try again.
+ */
+ if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu)
+ return NULL;
+
+ if (owner == p) {
+ /*
+ * It's possible we interleave with mutex_unlock like:
+ *
+ * lock(&rq->lock);
+ * find_proxy_task()
+ * mutex_unlock()
+ * lock(&wait_lock);
+ * donor(owner) = current->blocked_donor;
+ * unlock(&wait_lock);
+ *
+ * wake_up_q();
+ * ...
+ * ttwu_runnable()
+ * __task_rq_lock()
+ * lock(&wait_lock);
+ * owner == p
+ *
+ * Which leaves us to finish the ttwu_runnable() and make it go.
+ *
+ * So schedule rq->idle so that ttwu_runnable() can get the rq
+ * lock and mark owner as running.
+ */
+ return proxy_resched_idle(rq);
+ }
+ /*
+ * OK, now we're absolutely sure @owner is on this
+ * rq, therefore holding @rq->lock is sufficient to
+ * guarantee its existence, as per ttwu_remote().
+ */
+ }
+
+ WARN_ON_ONCE(owner && !owner->on_rq);
+ return owner;
+}
+#else /* SCHED_PROXY_EXEC */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n");
+ return donor;
+}
+#endif /* SCHED_PROXY_EXEC */
+
+static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
+{
+ if (!sched_proxy_exec())
+ return;
+ /*
+ * pick_next_task() calls set_next_task() on the chosen task
+ * at some point, which ensures it is not push/pullable.
+ * However, the chosen/donor task *and* the mutex owner form an
+ * atomic pair wrt push/pull.
+ *
+ * Make sure owner we run is not pushable. Unfortunately we can
+ * only deal with that by means of a dequeue/enqueue cycle. :-/
+ */
+ dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
+ enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+}
+
/*
* __schedule() is the main scheduler function.
*
@@ -6657,7 +6828,8 @@ static void __sched notrace __schedule(int sched_mode)
struct rq *rq;
int cpu;
- trace_sched_entry_tp(preempt, CALLER_ADDR0);
+ /* Trace preemptions consistently with task switches */
+ trace_sched_entry_tp(sched_mode == SM_PREEMPT);
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -6668,6 +6840,8 @@ static void __sched notrace __schedule(int sched_mode)
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
+ klp_sched_try_switch(prev);
+
local_irq_disable();
rcu_note_context_switch(preempt);
@@ -6713,15 +6887,31 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- try_to_block_task(rq, prev, prev_state);
+ /*
+ * We pass task_is_blocked() as the should_block arg
+ * in order to keep mutex-blocked tasks on the runqueue
+ * for selection with proxy-exec (without proxy-exec
+ * task_is_blocked() will always be false).
+ */
+ try_to_block_task(rq, prev, &prev_state,
+ !task_is_blocked(prev));
switch_count = &prev->nvcsw;
}
- next = pick_next_task(rq, prev, &rf);
+pick_again:
+ next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ if (unlikely(task_is_blocked(next))) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next)
+ goto pick_again;
+ if (next == rq->idle)
+ goto keep_resched;
+ }
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+keep_resched:
rq->last_seen_need_resched_ns = 0;
is_switch = prev != next;
@@ -6732,6 +6922,10 @@ picked:
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
+
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6766,11 +6960,15 @@ picked:
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ /* In case next was already curr but just got blocked_donor */
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
- trace_sched_exit_tp(is_switch, CALLER_ADDR0);
+ trace_sched_exit_tp(is_switch);
}
void __noreturn do_task_dead(void)
@@ -6974,14 +7172,14 @@ NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_dynamic_enabled
-#define preempt_schedule_dynamic_enabled preempt_schedule
-#define preempt_schedule_dynamic_disabled NULL
-#endif
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# ifndef preempt_schedule_dynamic_enabled
+# define preempt_schedule_dynamic_enabled preempt_schedule
+# define preempt_schedule_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
@@ -6991,8 +7189,8 @@ void __sched notrace dynamic_preempt_schedule(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -7047,14 +7245,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_notrace_dynamic_enabled
-#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
-#define preempt_schedule_notrace_dynamic_disabled NULL
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# ifndef preempt_schedule_notrace_dynamic_enabled
+# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+# define preempt_schedule_notrace_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
@@ -7064,7 +7262,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
-#endif
+# endif
#endif
#endif /* CONFIG_PREEMPTION */
@@ -7283,7 +7481,7 @@ out_unlock:
preempt_enable();
}
-#endif
+#endif /* CONFIG_RT_MUTEXES */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
@@ -7314,21 +7512,20 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define cond_resched_dynamic_enabled __cond_resched
-#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# define cond_resched_dynamic_enabled __cond_resched
+# define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
-#define might_resched_dynamic_enabled __cond_resched
-#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+# define might_resched_dynamic_enabled __cond_resched
+# define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
- klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
@@ -7343,8 +7540,8 @@ int __sched dynamic_might_resched(void)
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -7410,9 +7607,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#ifdef CONFIG_GENERIC_ENTRY
-#include <linux/entry-common.h>
-#endif
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
+# endif
/*
* SC:cond_resched
@@ -7467,40 +7664,39 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-#ifndef CONFIG_PREEMPT_RT
+# ifndef CONFIG_PREEMPT_RT
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
-#endif
+# endif
if (!strcmp(str, "full"))
return preempt_dynamic_full;
-#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
if (!strcmp(str, "lazy"))
return preempt_dynamic_lazy;
-#endif
+# endif
return -EINVAL;
}
-#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
-#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
-#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
-#else
-#error "Unsupported PREEMPT_DYNAMIC mechanism"
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
+# else
+# error "Unsupported PREEMPT_DYNAMIC mechanism"
+# endif
static DEFINE_MUTEX(sched_dynamic_mutex);
-static bool klp_override;
static void __sched_dynamic_update(int mode)
{
@@ -7508,8 +7704,7 @@ static void __sched_dynamic_update(int mode)
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
* the ZERO state, which is invalid.
*/
- if (!klp_override)
- preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -7518,8 +7713,7 @@ static void __sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
- if (!klp_override)
- preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
@@ -7530,8 +7724,7 @@ static void __sched_dynamic_update(int mode)
break;
case preempt_dynamic_voluntary:
- if (!klp_override)
- preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
@@ -7542,8 +7735,7 @@ static void __sched_dynamic_update(int mode)
break;
case preempt_dynamic_full:
- if (!klp_override)
- preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -7554,8 +7746,7 @@ static void __sched_dynamic_update(int mode)
break;
case preempt_dynamic_lazy:
- if (!klp_override)
- preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -7576,36 +7767,6 @@ void sched_dynamic_update(int mode)
mutex_unlock(&sched_dynamic_mutex);
}
-#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
-
-static int klp_cond_resched(void)
-{
- __klp_sched_try_switch();
- return __cond_resched();
-}
-
-void sched_dynamic_klp_enable(void)
-{
- mutex_lock(&sched_dynamic_mutex);
-
- klp_override = true;
- static_call_update(cond_resched, klp_cond_resched);
-
- mutex_unlock(&sched_dynamic_mutex);
-}
-
-void sched_dynamic_klp_disable(void)
-{
- mutex_lock(&sched_dynamic_mutex);
-
- klp_override = false;
- __sched_dynamic_update(preempt_dynamic_mode);
-
- mutex_unlock(&sched_dynamic_mutex);
-}
-
-#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
@@ -7637,7 +7798,7 @@ static void __init preempt_dynamic_init(void)
}
}
-#define PREEMPT_MODEL_ACCESSOR(mode) \
+# define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
@@ -7682,7 +7843,7 @@ const char *preempt_model_str(void)
if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
seq_buf_printf(&s, "(%s)%s",
- preempt_dynamic_mode > 0 ?
+ preempt_dynamic_mode >= 0 ?
preempt_modes[preempt_dynamic_mode] : "undef",
brace ? "}" : "");
return seq_buf_str(&s);
@@ -7838,12 +7999,10 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
-#ifdef CONFIG_SMP
struct affinity_context ac = (struct affinity_context) {
.new_mask = cpumask_of(cpu),
.flags = 0,
};
-#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -7859,13 +8018,11 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
-#ifdef CONFIG_SMP
/*
* No validation and serialization required at boot time and for
* setting up the idle tasks of not yet online CPUs.
*/
set_cpus_allowed_common(idle, &ac);
-#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the CPU isn't yet set to this CPU so the
@@ -7884,9 +8041,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
-#ifdef CONFIG_SMP
idle->on_cpu = 1;
-#endif
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
@@ -7899,13 +8054,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
}
-#ifdef CONFIG_SMP
-
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -8141,7 +8292,7 @@ static void balance_hotplug_wait(void)
TASK_UNINTERRUPTIBLE);
}
-#else
+#else /* !CONFIG_HOTPLUG_CPU: */
static inline void balance_push(struct rq *rq)
{
@@ -8155,7 +8306,7 @@ static inline void balance_hotplug_wait(void)
{
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* !CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
{
@@ -8464,7 +8615,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_core_cpu_dying(cpu);
return 0;
}
-#endif
+#endif /* CONFIG_HOTPLUG_CPU */
void __init sched_init_smp(void)
{
@@ -8488,6 +8639,8 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+ sched_init_dl_servers();
+
sched_smp_initialized = true;
}
@@ -8498,13 +8651,6 @@ static int __init migration_init(void)
}
early_initcall(migration_init);
-#else
-void __init sched_init_smp(void)
-{
- sched_init_granularity();
-}
-#endif /* CONFIG_SMP */
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -8530,9 +8676,7 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
-#ifdef CONFIG_SMP
BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
-#endif
BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
@@ -8563,7 +8707,7 @@ void __init sched_init(void)
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
- root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+ scx_tg_init(&root_task_group);
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8575,9 +8719,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_SMP
init_defrootdomain();
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -8638,7 +8780,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8664,7 +8805,6 @@ void __init sched_init(void)
#ifdef CONFIG_HOTPLUG_CPU
rcuwait_init(&rq->hotplug_wait);
#endif
-#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
@@ -8712,10 +8852,9 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
-#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+
balance_push_set(smp_processor_id(), false);
-#endif
init_sched_fair_class();
init_sched_ext_class();
@@ -8848,7 +8987,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
}
EXPORT_SYMBOL_GPL(__cant_sleep);
-#ifdef CONFIG_SMP
+# ifdef CONFIG_SMP
void __cant_migrate(const char *file, int line)
{
static unsigned long prev_jiffy;
@@ -8879,8 +9018,8 @@ void __cant_migrate(const char *file, int line)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_migrate);
-#endif
-#endif
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
@@ -8920,7 +9059,7 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_KGDB_KDB)
+#ifdef CONFIG_KGDB_KDB
/*
* These functions are only useful for KDB.
*
@@ -8944,7 +9083,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_KGDB_KDB) */
+#endif /* CONFIG_KGDB_KDB */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -9003,7 +9142,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
- scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ scx_tg_init(tg);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -9018,7 +9157,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
unsigned long flags;
spin_lock_irqsave(&task_group_lock, flags);
- list_add_rcu(&tg->list, &task_groups);
+ list_add_tail_rcu(&tg->list, &task_groups);
/* Root should already exist: */
WARN_ON(!parent);
@@ -9204,11 +9343,15 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *task;
struct cgroup_subsys_state *css;
+ if (!rt_group_sched_enabled())
+ goto scx_check;
+
cgroup_taskset_for_each(task, css, tset) {
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
}
-#endif
+scx_check:
+#endif /* CONFIG_RT_GROUP_SCHED */
return scx_cgroup_can_attach(tset);
}
@@ -9436,47 +9579,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-/* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
-
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
- u64 burst)
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 period, quota, burst;
- if (tg == &root_task_group)
- return -EINVAL;
-
- /*
- * Ensure we have at some amount of bandwidth every period. This is
- * to prevent reaching a state of large arrears when throttled via
- * entity_tick() resulting in prolonged exit starvation.
- */
- if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
- return -EINVAL;
-
- /*
- * Likewise, bound things on the other side by preventing insane quota
- * periods. This also allows us to normalize in computing quota
- * feasibility.
- */
- if (period > max_cfs_quota_period)
- return -EINVAL;
+ period = (u64)period_us * NSEC_PER_USEC;
- /*
- * Bound quota to defend quota against overflow during bandwidth shift.
- */
- if (quota != RUNTIME_INF && quota > max_cfs_runtime)
- return -EINVAL;
+ if (quota_us == RUNTIME_INF)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)quota_us * NSEC_PER_USEC;
- if (quota != RUNTIME_INF && (burst > quota ||
- burst + quota > max_cfs_runtime))
- return -EINVAL;
+ burst = (u64)burst_us * NSEC_PER_USEC;
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
@@ -9531,28 +9650,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
return 0;
}
-static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static u64 tg_get_cfs_period(struct task_group *tg)
{
- u64 quota, period, burst;
+ u64 cfs_period_us;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- burst = tg->cfs_bandwidth.burst;
- if (cfs_quota_us < 0)
- quota = RUNTIME_INF;
- else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
- quota = (u64)cfs_quota_us * NSEC_PER_USEC;
- else
- return -EINVAL;
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
+ return cfs_period_us;
}
-static long tg_get_cfs_quota(struct task_group *tg)
+static u64 tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
- return -1;
+ return RUNTIME_INF;
quota_us = tg->cfs_bandwidth.quota;
do_div(quota_us, NSEC_PER_USEC);
@@ -9560,45 +9673,7 @@ static long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- period = (u64)cfs_period_us * NSEC_PER_USEC;
- quota = tg->cfs_bandwidth.quota;
- burst = tg->cfs_bandwidth.burst;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_period(struct task_group *tg)
-{
- u64 cfs_period_us;
-
- cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
- do_div(cfs_period_us, NSEC_PER_USEC);
-
- return cfs_period_us;
-}
-
-static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- burst = (u64)cfs_burst_us * NSEC_PER_USEC;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- quota = tg->cfs_bandwidth.quota;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_burst(struct task_group *tg)
+static u64 tg_get_cfs_burst(struct task_group *tg)
{
u64 burst_us;
@@ -9608,42 +9683,6 @@ static long tg_get_cfs_burst(struct task_group *tg)
return burst_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_quota(css_tg(css));
-}
-
-static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
- struct cftype *cftype, s64 cfs_quota_us)
-{
- return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
-}
-
-static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_period(css_tg(css));
-}
-
-static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_period_us)
-{
- return tg_set_cfs_period(css_tg(css), cfs_period_us);
-}
-
-static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_burst(css_tg(css));
-}
-
-static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_burst_us)
-{
- return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
-}
-
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -9778,6 +9817,143 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
}
#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
+static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_bw_runtime_us = MAX_BW;
+
+static void tg_bandwidth(struct task_group *tg,
+ u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ if (period_us_p)
+ *period_us_p = tg_get_cfs_period(tg);
+ if (quota_us_p)
+ *quota_us_p = tg_get_cfs_quota(tg);
+ if (burst_us_p)
+ *burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+ if (period_us_p)
+ *period_us_p = tg->scx.bw_period_us;
+ if (quota_us_p)
+ *quota_us_p = tg->scx.bw_quota_us;
+ if (burst_us_p)
+ *burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
+}
+
+static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 period_us;
+
+ tg_bandwidth(css_tg(css), &period_us, NULL, NULL);
+ return period_us;
+}
+
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+ int ret = 0;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /* Values should survive translation to nsec */
+ if (period_us > max_usec ||
+ (quota_us != RUNTIME_INF && quota_us > max_usec) ||
+ burst_us > max_usec)
+ return -EINVAL;
+
+ /*
+ * Ensure we have some amount of bandwidth every period. This is to
+ * prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota_us < min_bw_quota_period_us ||
+ period_us < min_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the other side by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period_us > max_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
+ return -EINVAL;
+
+ if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
+ burst_us + quota_us > max_bw_runtime_us))
+ return -EINVAL;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+ if (!ret)
+ scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+ return ret;
+}
+
+static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 quota_us;
+
+ tg_bandwidth(css_tg(css), NULL, &quota_us, NULL);
+ return quota_us; /* (s64)RUNTIME_INF becomes -1 */
+}
+
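[Editor's note] The "(s64)RUNTIME_INF becomes -1" remark above can be verified with a trivial standalone snippet. That RUNTIME_INF is ((u64)~0ULL) is recalled from the scheduler headers and is an assumption here; the snippet also assumes two's-complement conversion.

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const uint64_t runtime_inf = ~0ULL;	/* stand-in for RUNTIME_INF */

	/* the all-ones bit pattern reinterpreted as signed 64-bit is -1 */
	assert((int64_t)runtime_inf == -1);
	return 0;
}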
+static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 burst_us;
+
+ tg_bandwidth(css_tg(css), NULL, NULL, &burst_us);
+ return burst_us;
+}
+
+static int cpu_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 period_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 quota_us, burst_us;
+
+ tg_bandwidth(tg, NULL, &quota_us, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 quota_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, burst_us;
+
+ if (quota_us < 0)
+ quota_us = RUNTIME_INF;
+
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 burst_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, quota_us;
+
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
+
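[Editor's note] For reference, the range checks tg_set_bandwidth() performs can be summarized in a small standalone sketch. This is illustrative only: the constant names below are stand-ins for min_bw_quota_period_us, max_bw_quota_period_us and RUNTIME_INF, and the nsec-overflow and MAX_BW-related checks are omitted.

#include <stdbool.h>
#include <stdint.h>

#define QUOTA_INF            UINT64_MAX	/* "max" / unlimited quota */
#define MIN_QUOTA_PERIOD_US  1000ULL		/* 1ms lower bound */
#define MAX_QUOTA_PERIOD_US  1000000ULL		/* 1s upper bound on the period */

static bool bw_params_plausible(uint64_t period_us, uint64_t quota_us,
				uint64_t burst_us)
{
	/* every period must provide some bandwidth and stay within [1ms, 1s] */
	if (period_us < MIN_QUOTA_PERIOD_US || period_us > MAX_QUOTA_PERIOD_US)
		return false;
	if (quota_us != QUOTA_INF && quota_us < MIN_QUOTA_PERIOD_US)
		return false;
	/* burst may never exceed a finite quota */
	if (quota_us != QUOTA_INF && burst_us > quota_us)
		return false;
	return true;
}

int main(void)
{
	/* period 100ms, quota 250ms: valid, and entitles the group to 2.5 CPUs */
	return !bw_params_plausible(100000, 250000, 0);
}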
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
struct cftype *cft, s64 val)
@@ -9821,7 +9997,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
scx_group_set_idle(css_tg(css), idle);
return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -9836,22 +10012,24 @@ static struct cftype cpu_legacy_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
- .name = "cfs_quota_us",
- .read_s64 = cpu_cfs_quota_read_s64,
- .write_s64 = cpu_cfs_quota_write_s64,
+ .name = "cfs_period_us",
+ .read_u64 = cpu_period_read_u64,
+ .write_u64 = cpu_period_write_u64,
},
{
- .name = "cfs_period_us",
- .read_u64 = cpu_cfs_period_read_u64,
- .write_u64 = cpu_cfs_period_write_u64,
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_quota_read_s64,
+ .write_s64 = cpu_quota_write_s64,
},
{
.name = "cfs_burst_us",
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
{
.name = "stat",
.seq_show = cpu_cfs_stat_show,
@@ -9861,18 +10039,6 @@ static struct cftype cpu_legacy_files[] = {
.seq_show = cpu_cfs_local_stat_show,
},
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- {
- .name = "rt_runtime_us",
- .read_s64 = cpu_rt_runtime_read,
- .write_s64 = cpu_rt_runtime_write,
- },
- {
- .name = "rt_period_us",
- .read_u64 = cpu_rt_period_read_uint,
- .write_u64 = cpu_rt_period_write_uint,
- },
-#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -9890,6 +10056,55 @@ static struct cftype cpu_legacy_files[] = {
{ } /* Terminate */
};
+#ifdef CONFIG_RT_GROUP_SCHED
+static struct cftype rt_group_files[] = {
+ {
+ .name = "rt_runtime_us",
+ .read_s64 = cpu_rt_runtime_read,
+ .write_s64 = cpu_rt_runtime_write,
+ },
+ {
+ .name = "rt_period_us",
+ .read_u64 = cpu_rt_period_read_uint,
+ .write_u64 = cpu_rt_period_write_uint,
+ },
+ { } /* Terminate */
+};
+
+# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
+DEFINE_STATIC_KEY_FALSE(rt_group_sched);
+# else
+DEFINE_STATIC_KEY_TRUE(rt_group_sched);
+# endif
+
+static int __init setup_rt_group_sched(char *str)
+{
+ long val;
+
+ if (kstrtol(str, 0, &val) || val < 0 || val > 1) {
+ pr_warn("Unable to set rt_group_sched\n");
+ return 1;
+ }
+ if (val)
+ static_branch_enable(&rt_group_sched);
+ else
+ static_branch_disable(&rt_group_sched);
+
+ return 1;
+}
+__setup("rt_group_sched=", setup_rt_group_sched);
+
+static int __init cpu_rt_group_init(void)
+{
+ if (!rt_group_sched_enabled())
+ return 0;
+
+ WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files));
+ return 0;
+}
+subsys_initcall(cpu_rt_group_init);
+#endif /* CONFIG_RT_GROUP_SCHED */
+
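[Editor's note] rt_group_sched_enabled(), used by cpu_rt_group_init() above, is presumably a thin wrapper around the static key defined here; a minimal sketch under that assumption (the real helper lives in the scheduler headers, not in this hunk):

static inline bool rt_group_sched_enabled(void)
{
#ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
	return static_branch_unlikely(&rt_group_sched);
#else
	return static_branch_likely(&rt_group_sched);
#endif
}

Booting with rt_group_sched=0 keeps the key disabled, so cpu_rt_group_init() returns early and the legacy rt_runtime_us/rt_period_us files are never registered.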
static int cpu_extra_stat_show(struct seq_file *sf,
struct cgroup_subsys_state *css)
{
@@ -9912,7 +10127,7 @@ static int cpu_extra_stat_show(struct seq_file *sf,
cfs_b->nr_periods, cfs_b->nr_throttled,
throttled_usec, cfs_b->nr_burst, burst_usec);
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
return 0;
}
@@ -10010,32 +10225,32 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
}
/* caller should put the current value in *@periodp before calling */
-static int __maybe_unused cpu_period_quota_parse(char *buf,
- u64 *periodp, u64 *quotap)
+static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
+ u64 *quota_us_p)
{
char tok[21]; /* U64_MAX */
- if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+ if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1)
return -EINVAL;
- *periodp *= NSEC_PER_USEC;
-
- if (sscanf(tok, "%llu", quotap))
- *quotap *= NSEC_PER_USEC;
- else if (!strcmp(tok, "max"))
- *quotap = RUNTIME_INF;
- else
- return -EINVAL;
+ if (sscanf(tok, "%llu", quota_us_p) < 1) {
+ if (!strcmp(tok, "max"))
+ *quota_us_p = RUNTIME_INF;
+ else
+ return -EINVAL;
+ }
return 0;
}
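[Editor's note] A standalone illustration of what the reworked parser accepts (helper and macro names below are assumed). Unlike the old version, values now stay in microseconds rather than being converted to nanoseconds, and the caller still pre-seeds the period with the current value.

#include <stdio.h>
#include <string.h>

#define QUOTA_INF (~0ULL)	/* stand-in for RUNTIME_INF */

/* "50000 100000" -> quota=50000, period=100000; "max" -> unlimited quota */
static int parse_cpu_max(const char *buf, unsigned long long *period_us,
			 unsigned long long *quota_us)
{
	char tok[21];

	if (sscanf(buf, "%20s %llu", tok, period_us) < 1)
		return -1;
	if (sscanf(tok, "%llu", quota_us) < 1) {
		if (!strcmp(tok, "max"))
			*quota_us = QUOTA_INF;
		else
			return -1;
	}
	return 0;
}

int main(void)
{
	unsigned long long period = 100000, quota = 0;	/* pre-seed the period */

	if (!parse_cpu_max("max", &period, &quota))
		printf("quota=%llu period=%llu\n", quota, period);
	return 0;
}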
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
+ u64 period_us, quota_us;
- cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ cpu_period_quota_print(sf, period_us, quota_us);
return 0;
}
@@ -10043,17 +10258,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct task_group *tg = css_tg(of_css(of));
- u64 period = tg_get_cfs_period(tg);
- u64 burst = tg->cfs_bandwidth.burst;
- u64 quota;
+ u64 period_us, quota_us, burst_us;
int ret;
- ret = cpu_period_quota_parse(buf, &period, &quota);
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ ret = cpu_period_quota_parse(buf, &period_us, &quota_us);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
+ ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us);
return ret ?: nbytes;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
static struct cftype cpu_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -10076,7 +10290,7 @@ static struct cftype cpu_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
@@ -10086,10 +10300,10 @@ static struct cftype cpu_files[] = {
{
.name = "max.burst",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
-#endif
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -10103,7 +10317,7 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
-#endif
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
{ } /* terminate */
};
@@ -10124,7 +10338,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.threaded = true,
};
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
void dump_cpu_task(int cpu)
{
@@ -10703,7 +10917,6 @@ void sched_mm_cid_after_execve(struct task_struct *t)
smp_mb();
t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
}
- rseq_set_notify_resume(t);
}
void sched_mm_cid_fork(struct task_struct *t)
@@ -10711,7 +10924,7 @@ void sched_mm_cid_fork(struct task_struct *t)
WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
t->mm_cid_active = 1;
}
-#endif
+#endif /* CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
@@ -10746,4 +10959,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
if (ctx->running)
set_next_task(rq, ctx->p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index c4606ca89210..9ede71ecba7f 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -4,6 +4,8 @@
* A simple wrapper around refcount. An allocated sched_core_cookie's
* address is used to compute the cookie of the task.
*/
+#include "sched.h"
+
struct sched_core_cookie {
refcount_t refcnt;
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 0de9dda09949..23a56ba12d81 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -6,6 +6,8 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <linux/sched/cputime.h>
+#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 95baa12a1029..cdd740b3f774 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -6,6 +6,7 @@
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
+#include "sched.h"
static inline int parent(int i)
{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 0adeda93b5fb..11c0f1faa7e1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/spinlock.h>
#define IDX_INVALID -1
@@ -15,7 +17,6 @@ struct cpudl {
struct cpudl_item *elements;
};
-#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
@@ -23,4 +24,3 @@ int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
-#endif /* CONFIG_SMP */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 5252fb191fae..742fb9e62e1a 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,7 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1a19d69b91ed..0ab5f9d4bc59 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
@@ -81,9 +83,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->limits_changed)) {
- sg_policy->limits_changed = false;
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ if (unlikely(READ_ONCE(sg_policy->limits_changed))) {
+ WRITE_ONCE(sg_policy->limits_changed, false);
+ sg_policy->need_freq_update = true;
+
+ /*
+ * The above limits_changed update must occur before the reads
+ * of policy limits in cpufreq_driver_resolve_freq() or a policy
+ * limits update might be missed, so use a memory barrier to
+ * ensure it.
+ *
+ * This pairs with the write memory barrier in sugov_limits().
+ */
+ smp_mb();
+
+ return true;
+ } else if (sg_policy->need_freq_update) {
+ /* ignore_dl_rate_limit() wants a new frequency to be found. */
return true;
}
@@ -95,10 +111,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
- if (sg_policy->need_freq_update)
+ if (sg_policy->need_freq_update) {
sg_policy->need_freq_update = false;
- else if (sg_policy->next_freq == next_freq)
+ /*
+ * The policy limits have changed, but if the return value of
+ * cpufreq_driver_resolve_freq() after applying the new limits
+ * is still equal to the previously selected frequency, the
+ * driver callback need not be invoked unless the driver
+ * specifically wants that to happen on every update of the
+ * policy limits.
+ */
+ if (sg_policy->next_freq == next_freq &&
+ !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS))
+ return false;
+ } else if (sg_policy->next_freq == next_freq) {
return false;
+ }
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
@@ -354,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
@@ -365,7 +393,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
- sg_cpu->sg_policy->limits_changed = true;
+ sg_cpu->sg_policy->need_freq_update = true;
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
@@ -604,7 +632,7 @@ static const struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-struct cpufreq_governor schedutil_gov;
+static struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
@@ -871,10 +899,19 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->limits_changed = true;
+ /*
+	 * The limits_changed update below must take place after the updates
+ * of policy limits in cpufreq_set_policy() or a policy limits update
+ * might be missed, so use a memory barrier to ensure it.
+ *
+ * This pairs with the memory barrier in sugov_should_update_freq().
+ */
+ smp_wmb();
+
+ WRITE_ONCE(sg_policy->limits_changed, true);
}
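[Editor's note] The pairing between this write path and the read path in sugov_should_update_freq() is the classic publish/consume pattern. Below is a rough standalone C11 analogue, illustrative only: the kernel uses smp_wmb()/smp_mb() with WRITE_ONCE()/READ_ONCE() rather than C11 atomics, and the sketch assumes a single writer; all names are made up.

#include <stdatomic.h>
#include <stdbool.h>

struct fake_policy {
	unsigned int min, max;		/* the "policy limits" */
	atomic_bool limits_changed;	/* publication flag */
};

/* writer: store the new limits first, then publish the flag (release) */
void fake_set_limits(struct fake_policy *p, unsigned int min, unsigned int max)
{
	p->min = min;
	p->max = max;
	atomic_store_explicit(&p->limits_changed, true, memory_order_release);
}

/*
 * reader: clear the flag, then read the limits; acquire ordering guarantees
 * that the limits observed are at least as new as the flag just consumed
 */
bool fake_limits_update(struct fake_policy *p, unsigned int *min, unsigned int *max)
{
	if (!atomic_exchange_explicit(&p->limits_changed, false,
				      memory_order_acq_rel))
		return false;
	*min = p->min;
	*max = p->max;
	return true;
}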
-struct cpufreq_governor schedutil_gov = {
+static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
@@ -892,4 +929,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
+bool sugov_is_governor(struct cpufreq_policy *policy)
+{
+ return policy->governor == &schedutil_gov;
+}
+
cpufreq_governor_init(schedutil_gov);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 42c40cfdf836..76a9ac5eb794 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,6 +22,7 @@
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
+#include "sched.h"
/*
* p->rt_priority p->prio newpri cpupri
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index d6cba0020064..6f562088c056 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/atomic.h>
+#include <linux/cpumask.h>
+#include <linux/sched/rt.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
@@ -17,7 +20,6 @@ struct cpupri {
int *cpu_to_pri;
};
-#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask);
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
@@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 6dab4854c6c0..7097de2c8cda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,9 @@
/*
* Simple CPU accounting cgroup controller
*/
+#include <linux/sched/cputime.h>
+#include <linux/tsacct_kern.h>
+#include "sched.h"
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
@@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime)
return delta;
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static u64 irqtime_tick_accounted(u64 dummy)
{
@@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
-#endif
+#endif /* CONFIG_SCHED_CORE */
/*
* When a guest is interrupted for a longer amount of time, missed clock
@@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
return steal;
}
-#endif
+#endif /* CONFIG_PARAVIRT */
return 0;
}
@@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
return t->se.sum_exec_runtime;
}
-#else
+#else /* !CONFIG_64BIT: */
static u64 read_sum_exec_runtime(struct task_struct *t)
{
u64 ns;
@@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
return ns;
}
-#endif
+#endif /* !CONFIG_64BIT */
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
@@ -411,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks)
{
irqtime_account_process_tick(current, 0, ticks);
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
int nr_ticks) { }
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ad45a8fea245..e2d51f4306b3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,10 @@
*/
#include <linux/cpuset.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
+#include "pelt.h"
/*
* Default limits for DL period; on the top end we guard against small util
@@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void)
return 0;
}
late_initcall(sched_dl_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static bool dl_server(struct sched_dl_entity *dl_se)
{
@@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return pi_of(dl_se) != dl_se;
}
-#else
+#else /* !CONFIG_RT_MUTEXES: */
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se;
@@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return false;
}
-#endif
+#endif /* !CONFIG_RT_MUTEXES */
-#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -191,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw)
rq->dl.extra_bw += bw;
}
}
-#else
-static inline struct dl_bw *dl_bw_of(int i)
-{
- return &cpu_rq(i)->dl.dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- return 1;
-}
-
-static inline unsigned long dl_bw_capacity(int i)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-bool dl_bw_visited(int cpu, u64 cookie)
-{
- return false;
-}
-
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
- dl->extra_bw += bw;
-}
-#endif
static inline
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
@@ -552,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->root = RB_ROOT_CACHED;
-#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
-#else
- init_dl_bw(&dl_rq->dl_bw);
-#endif
dl_rq->running_bw = 0;
dl_rq->this_bw = 0;
init_dl_rq_bw_ratio(dl_rq);
}
-#ifdef CONFIG_SMP
-
static inline int dl_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->dlo_count);
@@ -753,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
return later_rq;
}
-#else
-
-static inline
-void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline
-void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline void deadline_queue_push_tasks(struct rq *rq)
-{
-}
-
-static inline void deadline_queue_pull_task(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -824,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ update_rq_clock(rq);
+
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1195,7 +1134,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
{
-#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
@@ -1209,12 +1147,13 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
push_dl_task(rq);
rq_repin_lock(rq, rf);
}
-#endif
}
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+static bool dl_server_stopped(struct sched_dl_entity *dl_se);
+
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1234,6 +1173,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
if (!dl_se->server_has_tasks(dl_se)) {
replenish_dl_entity(dl_se);
+ dl_server_stopped(dl_se);
return HRTIMER_NORESTART;
}
@@ -1339,7 +1279,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
-#ifdef CONFIG_SMP
if (unlikely(!rq->online)) {
/*
* If the runqueue is no longer available, migrate the
@@ -1356,7 +1295,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* there.
*/
}
-#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->donor))
@@ -1504,7 +1442,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
if (dl_entity_is_special(dl_se))
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+ scaled_delta_exec = delta_exec;
+ if (!dl_server(dl_se))
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
dl_se->runtime -= scaled_delta_exec;
@@ -1598,7 +1538,7 @@ throttle:
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
/*
@@ -1611,7 +1551,7 @@ throttle:
*/
void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
{
- s64 delta_exec, scaled_delta_exec;
+ s64 delta_exec;
if (!rq->fair_server.dl_defer)
return;
@@ -1624,9 +1564,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
if (delta_exec < 0)
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
-
- rq->fair_server.runtime -= scaled_delta_exec;
+ rq->fair_server.runtime -= delta_exec;
if (rq->fair_server.runtime < 0) {
rq->fair_server.dl_defer_running = 0;
@@ -1639,31 +1577,17 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime)
+ if (dl_se->dl_runtime) {
+ dl_se->dl_server_idle = 0;
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+ }
}
void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;
- /*
- * XXX: the apply do not work fine at the init phase for the
- * fair server because things are not yet set. We need to improve
- * this before getting generic.
- */
- if (!dl_server(dl_se)) {
- u64 runtime = 50 * NSEC_PER_MSEC;
- u64 period = 1000 * NSEC_PER_MSEC;
-
- dl_server_apply_params(dl_se, runtime, period, 1);
-
- dl_se->dl_server = 1;
- dl_se->dl_defer = 1;
- setup_new_dl_entity(dl_se);
- }
-
- if (!dl_se->dl_runtime)
+ if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
dl_se->dl_server_active = 1;
@@ -1674,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
void dl_server_stop(struct sched_dl_entity *dl_se)
{
- if (!dl_se->dl_runtime)
+ if (!dl_server(dl_se) || !dl_server_active(dl_se))
return;
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
@@ -1684,6 +1608,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
dl_se->dl_server_active = 0;
}
+static bool dl_server_stopped(struct sched_dl_entity *dl_se)
+{
+ if (!dl_se->dl_server_active)
+ return false;
+
+ if (dl_se->dl_server_idle) {
+ dl_server_stop(dl_se);
+ return true;
+ }
+
+ dl_se->dl_server_idle = 1;
+ return false;
+}
+
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
@@ -1693,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_se->server_pick_task = pick_task;
}
+void sched_init_dl_servers(void)
+{
+ int cpu;
+ struct rq *rq;
+ struct sched_dl_entity *dl_se;
+
+ for_each_online_cpu(cpu) {
+ u64 runtime = 50 * NSEC_PER_MSEC;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ rq = cpu_rq(cpu);
+
+ guard(rq_lock_irq)(rq);
+
+ dl_se = &rq->fair_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+ }
+}
+
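[Editor's note] With the defaults used above (50ms of runtime every 1000ms), each CPU's fair server reserves 5% of the CPU for fair tasks; a trivial standalone check of that arithmetic:

#include <stdio.h>

int main(void)
{
	const double runtime_ms = 50.0, period_ms = 1000.0;

	printf("fair-server bandwidth: %.0f%%\n", 100.0 * runtime_ms / period_ms);
	return 0;
}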
void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
{
u64 new_bw = dl_se->dl_bw;
@@ -1844,8 +1808,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
#define __node_2_dle(node) \
rb_entry((node), struct sched_dl_entity, rb_node)
-#ifdef CONFIG_SMP
-
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
@@ -1881,13 +1843,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
}
}
-#else
-
-static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-
-#endif /* CONFIG_SMP */
-
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
@@ -2166,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (dl_server(&p->dl))
return;
+ if (task_is_blocked(p))
+ return;
+
if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2214,8 +2172,6 @@ static void yield_task_dl(struct rq *rq)
rq_clock_skip_update(rq);
}
-#ifdef CONFIG_SMP
-
static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
struct rq *rq)
{
@@ -2345,7 +2301,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
return sched_stop_runnable(rq) || sched_dl_runnable(rq);
}
-#endif /* CONFIG_SMP */
/*
* Only called when both the current and waking task are -deadline
@@ -2359,7 +2314,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
return;
}
-#ifdef CONFIG_SMP
/*
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
@@ -2367,7 +2321,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
-#endif /* CONFIG_SMP */
}
#ifdef CONFIG_SCHED_HRTICK
@@ -2375,11 +2328,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
hrtick_start(rq, dl_se->runtime);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
@@ -2435,7 +2388,7 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (dl_server_active(dl_se)) {
+ if (!dl_server_stopped(dl_se)) {
dl_se->dl_yielded = 1;
update_curr_dl_se(rq, dl_se, 0);
}
@@ -2465,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+
+ if (task_is_blocked(p))
+ return;
+
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2500,8 +2457,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
@@ -2976,7 +2931,14 @@ void dl_clear_root_domain(struct root_domain *rd)
int i;
guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
+
+ /*
+ * Reset total_bw to zero and extra_bw to max_bw so that next
+	 * loop will add dl-servers' contributions back properly.
+ */
rd->dl_bw.total_bw = 0;
+ for_each_cpu(i, rd->span)
+ cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw;
/*
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
@@ -2995,8 +2957,6 @@ void dl_clear_root_domain_cpu(int cpu)
dl_clear_root_domain(cpu_rq(cpu)->rd);
}
-#endif /* CONFIG_SMP */
-
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -3069,10 +3029,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
if (rq->donor != p) {
-#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
-#endif
if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
@@ -3092,7 +3050,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
if (!task_on_rq_queued(p))
return;
-#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
* we don't have the old deadline value, and
@@ -3121,13 +3078,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
resched_curr(rq);
}
-#else
- /*
- * We don't know if p has a earlier or later deadline, so let's blindly
- * set a (maybe not needed) rescheduling point.
- */
- resched_curr(rq);
-#endif
}
#ifdef CONFIG_SCHED_CORE
@@ -3149,7 +3099,6 @@ DEFINE_SCHED_CLASS(dl) = {
.put_prev_task = put_prev_task_dl,
.set_next_task = set_next_task_dl,
-#ifdef CONFIG_SMP
.balance = balance_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
@@ -3158,7 +3107,6 @@ DEFINE_SCHED_CLASS(dl) = {
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
.find_lock_rq = find_lock_later_rq,
-#endif
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
@@ -3242,6 +3190,9 @@ void sched_dl_do_global(void)
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+ for_each_possible_cpu(cpu)
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
@@ -3257,7 +3208,6 @@ void sched_dl_do_global(void)
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
}
}
@@ -3458,7 +3408,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
return false;
}
-#ifdef CONFIG_SMP
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -3570,7 +3519,6 @@ void dl_bw_free(int cpu, u64 dl_bw)
{
dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
}
-#endif
void print_dl_stats(struct seq_file *m, int cpu)
{
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 56ae54e0ce6a..3f06ab84d53f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,6 +6,9 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
+#include <linux/debugfs.h>
+#include <linux/nmi.h>
+#include "sched.h"
/*
* This allows printing both to /sys/kernel/debug/sched/debug and
@@ -90,10 +93,10 @@ static void sched_feat_enable(int i)
{
static_key_enable_cpuslocked(&sched_feat_keys[i]);
}
-#else
+#else /* !CONFIG_JUMP_LABEL: */
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* !CONFIG_JUMP_LABEL */
static int sched_feat_set(char *cmp)
{
@@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
-#ifdef CONFIG_SMP
-
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = {
.release = single_release,
};
-#endif /* SMP */
-
#ifdef CONFIG_PREEMPT_DYNAMIC
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
@@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = {
__read_mostly bool sched_debug_verbose;
-#ifdef CONFIG_SMP
static struct dentry *sd_dentry;
@@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
return result;
}
-#else
-#define sched_verbose_write debugfs_write_file_bool
-#endif
static const struct file_operations sched_verbose_fops = {
.read = debugfs_read_file_bool,
@@ -512,7 +507,6 @@ static __init int sched_init_debug(void)
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
-#ifdef CONFIG_SMP
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
@@ -520,7 +514,6 @@ static __init int sched_init_debug(void)
sched_domains_mutex_lock();
update_sched_domain_debugfs();
sched_domains_mutex_unlock();
-#endif
#ifdef CONFIG_NUMA_BALANCING
numa = debugfs_create_dir("numa_balancing", debugfs_sched);
@@ -530,7 +523,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
@@ -540,8 +533,6 @@ static __init int sched_init_debug(void)
}
late_initcall(sched_init_debug);
-#ifdef CONFIG_SMP
-
static cpumask_var_t sd_sysctl_cpus;
static int sd_flags_show(struct seq_file *m, void *v)
@@ -588,6 +579,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
+
+ if (sd->flags & SD_ASYM_PACKING)
+ debugfs_create_u32("group_asym_prefer_cpu", 0444, parent,
+ (u32 *)&sd->groups->asym_prefer_cpu);
}
void update_sched_domain_debugfs(void)
@@ -648,8 +643,6 @@ void dirty_sched_domain_sysctl(int cpu)
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
@@ -686,18 +679,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
}
P(se->load.weight);
-#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_avg);
-#endif
#undef PN_SCHEDSTAT
#undef PN
#undef P_SCHEDSTAT
#undef P
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
static DEFINE_SPINLOCK(sched_debug_lock);
@@ -850,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
@@ -870,8 +860,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CFS_BANDWIDTH
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
@@ -925,11 +914,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
PU(dl_nr_running);
-#ifdef CONFIG_SMP
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
-#else
- dl_bw = &dl_rq->dl_bw;
-#endif
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
@@ -947,9 +932,9 @@ static void print_cpu(struct seq_file *m, int cpu)
SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
-#else
+#else /* !CONFIG_X86: */
SEQ_printf(m, "cpu#%d\n", cpu);
-#endif
+#endif /* !CONFIG_X86 */
#define P(x) \
do { \
@@ -972,12 +957,10 @@ do { \
#undef P
#undef PN
-#ifdef CONFIG_SMP
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P64(avg_idle);
P64(max_idle_balance_cost);
#undef P64
-#endif
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
if (schedstat_enabled()) {
@@ -1159,7 +1142,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
}
void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
@@ -1243,7 +1226,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
-#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_sum);
P(se.avg.util_sum);
@@ -1252,13 +1234,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.util_avg);
P(se.avg.last_update_time);
PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
-#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
-#endif
+#endif /* CONFIG_UCLAMP_TASK */
P(policy);
P(prio);
if (task_has_dl_policy(p)) {
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 21575d39c376..7dedc9a16281 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -26,7 +26,7 @@ enum scx_consts {
* Iterating all tasks may take a while. Periodically drop
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
- SCX_OPS_TASK_ITER_BATCH = 32,
+ SCX_TASK_ITER_BATCH = 32,
};
enum scx_exit_kind {
@@ -44,9 +44,9 @@ enum scx_exit_kind {
};
/*
- * An exit code can be specified when exiting with scx_bpf_exit() or
- * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
- * respectively. The codes are 64bit of the format:
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
*
* Bits: [63 .. 48 47 .. 32 31 .. 0]
* [ SYS ACT ] [ SYS RSN ] [ USR ]
@@ -163,16 +163,21 @@ enum scx_ops_flags {
/*
* CPU cgroup support flags
*/
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
+	SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed in 6.18 */
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
};
/* argument container for ops.init_task() */
@@ -198,6 +203,11 @@ struct scx_exit_task_args {
struct scx_cgroup_init_args {
/* the weight of the cgroup [1..10000] */
u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
};
enum scx_cpu_preempt_reason {
@@ -368,6 +378,15 @@ struct sched_ext_ops {
* @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+	 * property is changed (e.g., affinity), since scx_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
* See ->runnable() for explanation on the task state notifiers.
*/
void (*running)(struct task_struct *p);
@@ -377,6 +396,15 @@ struct sched_ext_ops {
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+	 * property is changed (e.g., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
* See ->runnable() for explanation on the task state notifiers. If
* !@runnable, ->quiescent() will be invoked after this operation
* returns.
@@ -465,6 +493,7 @@ struct sched_ext_ops {
* idle CPU tracking and the following helpers become unavailable:
*
* - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
* - scx_bpf_test_and_clear_cpu_idle()
* - scx_bpf_pick_idle_cpu()
*
@@ -640,9 +669,31 @@ struct sched_ext_ops {
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
- * Update @tg's weight to @weight.
+ * Update @cgrp's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
#endif /* CONFIG_EXT_GROUP_SCHED */
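[Editor's note] A hypothetical sched_ext BPF scheduler fragment showing how the ops.cgroup_set_bandwidth() parameters documented above might simply be cached per cgroup for later enforcement. The scheduler name, map layout and the common header are assumptions, not part of this patch; BPF_STRUCT_OPS, SEC(".maps") and cgrp->kn->id follow the existing sched_ext BPF conventions.

#include <scx/common.bpf.h>	/* assumed sched_ext BPF helper header */

struct cgrp_bw {
	u64 period_us;
	u64 quota_us;
	u64 burst_us;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 4096);
	__type(key, u64);			/* cgroup ID */
	__type(value, struct cgrp_bw);
} cgrp_bw_map SEC(".maps");

void BPF_STRUCT_OPS(example_cgroup_set_bandwidth, struct cgroup *cgrp,
		    u64 period_us, u64 quota_us, u64 burst_us)
{
	struct cgrp_bw bw = {
		.period_us = period_us,
		.quota_us = quota_us,
		.burst_us = burst_us,
	};
	u64 cgid = cgrp->kn->id;

	/* remember the latest cpu.max / cpu.max.burst settings for @cgrp */
	bpf_map_update_elem(&cgrp_bw_map, &cgid, &bw, BPF_ANY);
}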
/*
@@ -728,6 +779,9 @@ struct sched_ext_ops {
* BPF scheduler is enabled.
*/
char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
};
enum scx_opi {
@@ -739,6 +793,98 @@ enum scx_opi {
SCX_OPI_END = SCX_OP_IDX(init),
};
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats __percpu *event_stats_cpu;
+
+ bool warned_zero_slice;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
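[Editor's note] A sketch (assumed helper name, not part of this patch) of how a system-wide view could be folded from the per-CPU counters in event_stats_cpu, as the comment in the struct above describes:

static void example_sum_bypass_events(struct scx_sched *sch,
				      s64 *nr_dispatched, s64 *nr_activated)
{
	int cpu;

	*nr_dispatched = *nr_activated = 0;
	for_each_possible_cpu(cpu) {
		struct scx_event_stats *e =
			per_cpu_ptr(sch->event_stats_cpu, cpu);

		*nr_dispatched += e->SCX_EV_BYPASS_DISPATCH;
		*nr_activated += e->SCX_EV_BYPASS_ACTIVATE;
	}
}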
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
@@ -765,7 +911,7 @@ enum scx_enq_flags {
/*
* The task being enqueued was previously enqueued on the current CPU's
* %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
* invoked in a ->cpu_release() callback, and the task is again
* dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
* task will not be scheduled on the CPU until at least the next invocation
@@ -838,18 +984,18 @@ enum scx_tg_flags {
SCX_TG_INITED = 1U << 1,
};
-enum scx_ops_enable_state {
- SCX_OPS_ENABLING,
- SCX_OPS_ENABLED,
- SCX_OPS_DISABLING,
- SCX_OPS_DISABLED,
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
};
-static const char *scx_ops_enable_state_str[] = {
- [SCX_OPS_ENABLING] = "enabling",
- [SCX_OPS_ENABLED] = "enabled",
- [SCX_OPS_DISABLING] = "disabling",
- [SCX_OPS_DISABLED] = "disabled",
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
};
/*
@@ -898,6 +1044,16 @@ enum scx_ops_state {
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
/*
+ * NOTE: sched_ext is in the process of growing multiple scheduler support and
+ * scx_root usage is in a transitional state. Naked dereferences are safe if the
+ * caller is one of the tasks attached to SCX and explicit RCU dereference is
+ * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
+ * are used as temporary markers to indicate that the dereferences need to be
+ * updated to point to the associated scheduler instances rather than scx_root.
+ */
+static struct scx_sched __rcu *scx_root;
+
+/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
* guarantee system safety. Maintain a dedicated task list which contains every
@@ -907,33 +1063,17 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
-static struct kthread_worker *scx_ops_helper;
-static DEFINE_MUTEX(scx_ops_enable_mutex);
-DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+static DEFINE_MUTEX(scx_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
-static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static unsigned long scx_in_softlockup;
-static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
-static int scx_ops_bypass_depth;
-static bool scx_ops_init_task_enabled;
+static atomic_t scx_breather_depth = ATOMIC_INIT(0);
+static int scx_bypass_depth;
+static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
-static struct sched_ext_ops scx_ops;
-static bool scx_warned_zero_slice;
-
-DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-
-static struct static_key_false scx_has_op[SCX_OPI_END] =
- { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
-
-static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
-static struct scx_exit_info *scx_exit_info;
-
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -947,7 +1087,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
* being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_ops_error().
+ * scx_error().
*/
static unsigned long scx_watchdog_timeout;
@@ -973,23 +1113,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
*/
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
-/*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
- * to avoid live-locking in bypass mode where all tasks are dispatched to
- * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
- * sufficient, it can be further split.
- */
-static struct scx_dispatch_q **global_dsqs;
-
static const struct rhashtable_params dsq_hash_params = {
.key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
-static struct rhashtable dsq_hash;
static LLIST_HEAD(dsqs_to_free);
/* dispatch buf */
@@ -1036,27 +1165,46 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
-static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...);
+static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, const char *fmt, va_list args);
+
+static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+}
-#define scx_ops_error_kind(err, fmt, args...) \
- scx_ops_exit_kind((err), 0, fmt, ##args)
+static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ struct scx_sched *sch;
+ va_list args;
-#define scx_ops_exit(code, fmt, args...) \
- scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+ }
+ rcu_read_unlock();
+}
-#define scx_ops_error(fmt, args...) \
- scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
-#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
+#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
@@ -1086,12 +1234,14 @@ static bool u32_before(u32 a, u32 b)
static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
{
- return global_dsqs[cpu_to_node(task_cpu(p))];
+ struct scx_sched *sch = scx_root;
+
+ return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
/*
@@ -1118,27 +1268,56 @@ static void scx_kf_disallow(u32 mask)
current->scx.kf_mask &= ~mask;
}
-#define SCX_CALL_OP(mask, op, args...) \
+/*
+ * Track the rq currently locked.
+ *
+ * This allows kfuncs to safely operate on rq from any scx ops callback,
+ * knowing which rq is already locked.
+ */
+DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+static inline void update_locked_rq(struct rq *rq)
+{
+ /*
+ * Check whether @rq is actually locked. This can help expose bugs
+ * or incorrect assumptions about the context in which a kfunc or
+ * callback is executed.
+ */
+ if (rq)
+ lockdep_assert_rq_held(rq);
+ __this_cpu_write(scx_locked_rq_state, rq);
+}
+
+#define SCX_CALL_OP(sch, mask, op, rq, args...) \
do { \
+ if (rq) \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
} \
+ if (rq) \
+ update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, args...) \
+#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
({ \
- __typeof__(scx_ops.op(args)) __ret; \
+ __typeof__((sch)->ops.op(args)) __ret; \
+ \
+ if (rq) \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
} \
+ if (rq) \
+ update_locked_rq(NULL); \
__ret; \
})
@@ -1153,31 +1332,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, task, ##args); \
+ SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
({ \
- __typeof__(scx_ops.op(task, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
({ \
- __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -1187,8 +1366,8 @@ do { \
static __always_inline bool scx_kf_allowed(u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1201,13 +1380,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_ops_error("cpu_release kfunc called from a nested operation");
+ scx_kf_error("cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_ops_error("dispatch kfunc called from a nested operation");
+ scx_kf_error("dispatch kfunc called from a nested operation");
return false;
}
@@ -1223,18 +1402,13 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_ops_error("called on a task not being operated on");
+ scx_kf_error("called on a task not being operated on");
return false;
}
return true;
}
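For context, a hedged sketch (the kfunc name is hypothetical, not from this patch) of how the kf_tasks[] slots set by the SCX_CALL_OP_TASK*() macros pair with the check above: a kfunc operating on a task argument guards itself before touching @p:

	__bpf_kfunc void scx_bpf_example_task_op(struct task_struct *p)
	{
		if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_TERMINAL, p))
			return;
		/* @p is one of the tasks the current callback was invoked on */
	}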
-static bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
* @dsq: user dsq being iterated
@@ -1402,15 +1576,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter)
* @iter: iterator to walk
*
* Visit the next task. See scx_task_iter_start() for details. Locks are dropped
- * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
- * stalls by holding scx_tasks_lock for too long.
+ * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
+ * by holding scx_tasks_lock for too long.
*/
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
scx_task_iter_relock(iter);
@@ -1481,91 +1655,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * The total number of tasks enqueued (or pick_task-ed) with a
- * default time slice (SCX_SLICE_DFL).
- */
- s64 SCX_EV_ENQ_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
/**
* scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of times the event occurred
*
* This can be used when preemption is not disabled.
*/
-#define scx_add_event(name, cnt) do { \
- this_cpu_add(event_stats_cpu.name, cnt); \
- trace_sched_ext_event(#name, cnt); \
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
} while(0)
/**
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of times the event occurred
*
* This should be used only when preemption is disabled.
*/
-#define __scx_add_event(name, cnt) do { \
- __this_cpu_add(event_stats_cpu.name, cnt); \
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
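To make the per-CPU layout concrete, a minimal sketch (assuming a single counter is summed; the full aggregation is done by scx_read_events(), referenced below) of how a system-wide value would be constructed from the per-CPU counters:

	s64 total = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct scx_event_stats *e = per_cpu_ptr(sch->event_stats_cpu, cpu);

		total += e->SCX_EV_ENQ_SKIP_EXITING;
	}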
@@ -1590,30 +1702,25 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
} while (0)
-static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+static void scx_read_events(struct scx_sched *sch,
+ struct scx_event_stats *events);
-static enum scx_ops_enable_state scx_ops_enable_state(void)
+static enum scx_enable_state scx_enable_state(void)
{
- return atomic_read(&scx_ops_enable_state_var);
+ return atomic_read(&scx_enable_state_var);
}
-static enum scx_ops_enable_state
-scx_ops_set_enable_state(enum scx_ops_enable_state to)
+static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
- return atomic_xchg(&scx_ops_enable_state_var, to);
+ return atomic_xchg(&scx_enable_state_var, to);
}
-static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
- enum scx_ops_enable_state from)
+static bool scx_tryset_enable_state(enum scx_enable_state to,
+ enum scx_enable_state from)
{
int from_v = from;
- return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
-}
-
-static bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+ return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
/**
@@ -1633,8 +1740,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}
+static inline bool __cpu_valid(s32 cpu)
+{
+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
+}
+
/**
- * ops_cpu_valid - Verify a cpu number
+ * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * @sch: scx_sched to abort on error
* @cpu: cpu number which came from a BPF ops
* @where: extra information reported on error
*
@@ -1642,35 +1755,52 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
* Verify that it is in range and one of the possible cpus. If invalid, trigger
* an ops error.
*/
-static bool ops_cpu_valid(s32 cpu, const char *where)
+static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
{
- if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
+ if (__cpu_valid(cpu)) {
return true;
} else {
- scx_ops_error("invalid CPU %d%s%s", cpu,
- where ? " " : "", where ?: "");
+ scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
+ return false;
+ }
+}
+
+/**
+ * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * The same as ops_cpu_valid() but @sch is implicit.
+ */
+static bool kf_cpu_valid(u32 cpu, const char *where)
+{
+ if (__cpu_valid(cpu)) {
+ return true;
+ } else {
+ scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
return false;
}
}
/**
* ops_sanitize_err - Sanitize a -errno value
+ * @sch: scx_sched to abort on error
* @ops_name: operation to blame on failure
* @err: -errno value to sanitize
*
- * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
+ * Verify @err is a valid -errno. If not, trigger scx_error() and return
* -%EPROTO. This is necessary because returning a rogue -errno up the chain can
* cause misbehaviors. For an example, a large negative return from
* ops.init_task() triggers an oops when passed up the call chain because the
* value fails IS_ERR() test after being encoded with ERR_PTR() and then is
* handled as a pointer.
*/
-static int ops_sanitize_err(const char *ops_name, s32 err)
+static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
if (err < 0 && err >= -MAX_ERRNO)
return err;
- scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
+ scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
return -EPROTO;
}
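The doc comment's point about rogue errnos can be made concrete with a small sketch (illustration only, not code from this patch): a value below -MAX_ERRNO survives ERR_PTR() encoding but fails the IS_ERR() test, so callers end up treating it as a real pointer:

	void *ptr = ERR_PTR(-100000);	/* below -MAX_ERRNO (-4095) */

	if (!IS_ERR(ptr))		/* true: not recognized as an error value */
		pr_info("treated as a pointer: %px\n", ptr);	/* deref would oops */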
@@ -1679,12 +1809,10 @@ static void run_deferred(struct rq *rq)
process_ddsp_deferred_locals(rq);
}
-#ifdef CONFIG_SMP
static void deferred_bal_cb_workfn(struct rq *rq)
{
run_deferred(rq);
}
-#endif
static void deferred_irq_workfn(struct irq_work *irq_work)
{
@@ -1707,7 +1835,6 @@ static void schedule_deferred(struct rq *rq)
{
lockdep_assert_rq_held(rq);
-#ifdef CONFIG_SMP
/*
* If in the middle of waking up a task, task_woken_scx() will be called
* afterwards which will then run the deferred actions, no need to
@@ -1725,7 +1852,7 @@ static void schedule_deferred(struct rq *rq)
deferred_bal_cb_workfn);
return;
}
-#endif
+
/*
* No scheduler hooks available. Queue an irq work. They are executed on
* IRQ re-enable which may take a bit longer than the scheduler hooks.
@@ -1777,7 +1904,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
lockdep_assert_rq_held(rq);
#ifdef CONFIG_SCHED_CORE
- if (SCX_HAS_OP(core_sched_before))
+ if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
touch_core_sched(rq, p);
#endif
}
@@ -1815,8 +1942,14 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void refill_task_slice_dfl(struct task_struct *p)
+{
+ p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+}
+
+static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
bool is_local = dsq->id == SCX_DSQ_LOCAL;
@@ -1827,7 +1960,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
if (!is_local) {
raw_spin_lock(&dsq->lock);
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
- scx_ops_error("attempting to dispatch to a destroyed dsq");
+ scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
dsq = find_global_dsq(p);
@@ -1844,7 +1977,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
* disallow any internal DSQ from doing vtime ordering of
* tasks.
*/
- scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ scx_error(sch, "cannot use vtime ordering for built-in DSQs");
enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
}
@@ -1858,8 +1991,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
*/
if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
nldsq_next_task(dsq, NULL, false)))
- scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
@@ -1880,8 +2013,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
- scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_list.node, &dsq->list);
@@ -1996,7 +2129,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
-static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+ struct rq *rq, u64 dsq_id,
struct task_struct *p)
{
struct scx_dispatch_q *dsq;
@@ -2007,7 +2141,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
return find_global_dsq(p);
return &cpu_rq(cpu)->scx.local_dsq;
@@ -2016,11 +2150,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if (dsq_id == SCX_DSQ_GLOBAL)
dsq = find_global_dsq(p);
else
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
- dsq_id, p->comm, p->pid);
+ scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
return find_global_dsq(p);
}
@@ -2041,12 +2175,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_ops_error("%s[%d] already direct-dispatched",
- p->comm, p->pid);
+ scx_kf_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
else
- scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
- ddsp_task->comm, ddsp_task->pid,
- p->comm, p->pid);
+ scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
return;
}
@@ -2057,11 +2191,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
p->scx.ddsp_enq_flags = enq_flags;
}
-static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
touch_core_sched_dispatch(rq, p);
@@ -2102,7 +2237,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
return;
}
- dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p,
+ p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static bool scx_rq_online(struct rq *rq)
@@ -2120,6 +2256,7 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
+ struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -2138,7 +2275,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto local;
if (scx_rq_bypassing(rq)) {
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
goto global;
}
@@ -2146,20 +2283,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
- if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
- if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
}
- if (!SCX_HAS_OP(enqueue))
+ if (unlikely(!SCX_HAS_OP(sch, enqueue)))
goto global;
/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
@@ -2172,7 +2309,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+ SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2186,7 +2323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
return;
direct:
- direct_dispatch(p, enq_flags);
+ direct_dispatch(sch, p, enq_flags);
return;
local:
@@ -2196,17 +2333,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
local_norefill:
- dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(p);
+ dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2224,7 +2359,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
}
/*
- * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * list_add_tail() must be used. scx_bypass() depends on tasks being
* appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
@@ -2239,6 +2374,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
+ struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
if (enq_flags & ENQUEUE_WAKEUP)
@@ -2268,8 +2404,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+ if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2280,11 +2416,12 @@ out:
if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
unlikely(cpu_of(rq) != p->scx.selected_cpu))
- __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+ __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
-static void ops_dequeue(struct task_struct *p, u64 deq_flags)
+static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
+ struct scx_sched *sch = scx_root;
unsigned long opss;
/* dequeue is always temporary, don't reset runnable_at */
@@ -2303,8 +2440,9 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+ if (SCX_HAS_OP(sch, dequeue))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+ p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2332,12 +2470,14 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
{
+ struct scx_sched *sch = scx_root;
+
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p));
return true;
}
- ops_dequeue(p, deq_flags);
+ ops_dequeue(rq, p, deq_flags);
/*
* A currently running task which is going off @rq first gets dequeued
@@ -2351,13 +2491,13 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
* information meaningful to the BPF scheduler and can be suppressed by
* skipping the callbacks if the task is !QUEUED.
*/
- if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
}
- if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+ if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2374,20 +2514,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
static void yield_task_scx(struct rq *rq)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *p = rq->curr;
- if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+ if (SCX_HAS_OP(sch, yield))
+ SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *from = rq->curr;
- if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+ if (SCX_HAS_OP(sch, yield))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
+ from, to);
else
return false;
}
@@ -2413,7 +2556,6 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
p->scx.dsq = dst_dsq;
}
-#ifdef CONFIG_SMP
/**
* move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
* @p: task to move
@@ -2467,7 +2609,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* The caller must ensure that @p and @rq are on different CPUs.
*/
-static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
+static bool task_can_run_on_remote_rq(struct scx_sched *sch,
+ struct task_struct *p, struct rq *rq,
bool enforce)
{
int cpu = cpu_of(rq);
@@ -2488,8 +2631,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (unlikely(is_migration_disabled(p))) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
- p->comm, p->pid, task_cpu(p), cpu);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
return false;
}
@@ -2501,14 +2644,15 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
- cpu, p->comm, p->pid);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(scx_root,
+ SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2578,14 +2722,10 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
return false;
}
}
-#else /* CONFIG_SMP */
-static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
-static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
-#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
+ * @sch: scx_sched being operated on
* @p: target task
* @enq_flags: %SCX_ENQ_*
* @src_dsq: DSQ @p is currently on, must not be a local DSQ
@@ -2599,7 +2739,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p
* On return, @src_dsq is unlocked and only @p's new task_rq, which is the
* return value, is locked.
*/
-static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+static struct rq *move_task_between_dsqs(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct scx_dispatch_q *dst_dsq)
{
@@ -2612,7 +2753,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2646,7 +2787,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
p->scx.dsq = NULL;
raw_spin_unlock(&src_dsq->lock);
- dispatch_enqueue(dst_dsq, p, enq_flags);
+ dispatch_enqueue(sch, dst_dsq, p, enq_flags);
}
return dst_rq;
@@ -2658,13 +2799,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
* to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
-static void scx_ops_breather(struct rq *rq)
+static void scx_breather(struct rq *rq)
{
u64 until;
lockdep_assert_rq_held(rq);
- if (likely(!atomic_read(&scx_ops_breather_depth)))
+ if (likely(!atomic_read(&scx_breather_depth)))
return;
raw_spin_rq_unlock(rq);
@@ -2673,25 +2814,26 @@ static void scx_ops_breather(struct rq *rq)
do {
int cnt = 1024;
- while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ while (atomic_read(&scx_breather_depth) && --cnt)
cpu_relax();
- } while (atomic_read(&scx_ops_breather_depth) &&
+ } while (atomic_read(&scx_breather_depth) &&
time_before64(ktime_get_ns(), until));
raw_spin_rq_lock(rq);
}
-static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
+static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_ops_bypass()
- * dequeueing tasks from @dsq trying to put the system into the bypass
- * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
- * live-lock the machine into soft lockups. Give a breather.
+ * This retry loop can repeatedly race against scx_bypass() dequeueing
+ * tasks from @dsq trying to put the system into the bypass mode. On
+ * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
+ * the machine into soft lockups. Give a breather.
*/
- scx_ops_breather(rq);
+ scx_breather(rq);
/*
* The caller can't expect to successfully consume a task if the task's
@@ -2713,7 +2855,7 @@ retry:
return true;
}
- if (task_can_run_on_remote_rq(p, rq, false)) {
+ if (task_can_run_on_remote_rq(sch, p, rq, false)) {
if (likely(consume_remote_task(rq, p, dsq, task_rq)))
return true;
goto retry;
@@ -2724,15 +2866,16 @@ retry:
return false;
}
-static bool consume_global_dsq(struct rq *rq)
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
int node = cpu_to_node(cpu_of(rq));
- return consume_dispatch_q(rq, global_dsqs[node]);
+ return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
}
/**
* dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @sch: scx_sched being operated on
* @rq: current rq which is locked
* @dst_dsq: destination DSQ
* @p: task to dispatch
@@ -2745,14 +2888,13 @@ static bool consume_global_dsq(struct rq *rq)
* The caller must have exclusive ownership of @p (e.g. through
* %SCX_OPSS_DISPATCHING).
*/
-static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dst_dsq,
struct task_struct *p, u64 enq_flags)
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-#ifdef CONFIG_SMP
struct rq *locked_rq = rq;
-#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2761,14 +2903,14 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* If dispatching to @rq that @p is already on, no lock dancing needed.
*/
if (rq == src_rq && rq == dst_rq) {
- dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dst_dsq, p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
-#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
- dispatch_enqueue(find_global_dsq(p), p,
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
+ dispatch_enqueue(sch, find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -2806,7 +2948,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
*/
if (src_rq == dst_rq) {
p->scx.holding_cpu = -1;
- dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+ enq_flags);
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
@@ -2824,9 +2967,6 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
-#else /* CONFIG_SMP */
- BUG(); /* control can not reach here on UP */
-#endif /* CONFIG_SMP */
}
/**
@@ -2848,7 +2988,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* was valid in the first place. Make sure that the task is still owned by the
* BPF scheduler and claim the ownership before dispatching.
*/
-static void finish_dispatch(struct rq *rq, struct task_struct *p,
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p,
unsigned long qseq_at_dispatch,
u64 dsq_id, u64 enq_flags)
{
@@ -2901,15 +3042,15 @@ retry:
BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
- dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
if (dsq->id == SCX_DSQ_LOCAL)
- dispatch_to_local_dsq(rq, dsq, p, enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
else
- dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}
-static void flush_dispatch_buf(struct rq *rq)
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
u32 u;
@@ -2917,7 +3058,7 @@ static void flush_dispatch_buf(struct rq *rq)
for (u = 0; u < dspc->cursor; u++) {
struct scx_dsp_buf_ent *ent = &dspc->buf[u];
- finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+ finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
ent->enq_flags);
}
@@ -2927,6 +3068,7 @@ static void flush_dispatch_buf(struct rq *rq)
static int balance_one(struct rq *rq, struct task_struct *prev)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
@@ -2936,7 +3078,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
- if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+ if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
/*
* If the previous sched_class for the current CPU was not SCX,
@@ -2944,8 +3086,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+ if (SCX_HAS_OP(sch, cpu_acquire))
+ SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
+ cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2959,8 +3102,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* scheduler wants to handle this explicitly, it should
* implement ->cpu_release().
*
- * See scx_ops_disable_workfn() for the explanation on the
- * bypassing test.
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
*/
if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -2972,10 +3115,11 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
+ scx_rq_bypassing(rq) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -2990,10 +3134,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
- prev_on_scx ? prev : NULL);
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
+ cpu_of(rq), prev_on_scx ? prev : NULL);
- flush_dispatch_buf(rq);
+ flush_dispatch_buf(sch, rq);
if (prev_on_rq && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3001,7 +3145,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
}
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
/*
@@ -3024,10 +3168,10 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
- scx_rq_bypassing(rq))) {
+ if (prev_on_rq &&
+ (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
- __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3087,32 +3231,36 @@ static void process_ddsp_deferred_locals(struct rq *rq)
*/
while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *dsq;
list_del_init(&p->scx.dsq_list.node);
- dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
- dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p,
+ p->scx.ddsp_enq_flags);
}
}
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
+ struct scx_sched *sch = scx_root;
+
if (p->scx.flags & SCX_TASK_QUEUED) {
/*
* Core-sched might decide to execute @p before it is
* dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
- ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
+ ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(rq, p);
}
p->se.exec_start = rq_clock_task(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3142,10 +3290,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
-#ifdef CONFIG_SMP
if (class == &stop_sched_class)
return SCX_CPU_PREEMPT_STOP;
-#endif
if (class == &dl_sched_class)
return SCX_CPU_PREEMPT_DL;
if (class == &rt_sched_class)
@@ -3155,17 +3301,16 @@ preempt_reason_from_class(const struct sched_class *class)
static void switch_class(struct rq *rq, struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
-#ifdef CONFIG_SMP
/*
* Pairs with the smp_load_acquire() issued by a CPU in
* kick_cpus_irq_workfn() who is waiting for this CPU to perform a
* resched.
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
-#endif
- if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
/*
@@ -3187,14 +3332,14 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_scx() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(cpu_release)) {
+ if (SCX_HAS_OP(sch, cpu_release)) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
};
- SCX_CALL_OP(SCX_KF_CPU_RELEASE,
- cpu_release, cpu_of(rq), &args);
+ SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
+ cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3203,11 +3348,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3219,7 +3365,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* DSQ.
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
- dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+ SCX_ENQ_HEAD);
goto switch_class;
}
@@ -3230,7 +3377,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* which should trigger an explicit follow-up scheduling event.
*/
if (sched_class_above(&ext_sched_class, next->sched_class)) {
- WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
+ WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
do_enqueue_task(rq, p, 0, -1);
@@ -3283,7 +3430,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
* conditional on scx_enabled() and may have been skipped.
*/
- WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3294,10 +3441,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice) {
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- }
+ if (!p->scx.slice)
+ refill_task_slice_dfl(p);
} else {
p = first_local_task(rq);
if (!p) {
@@ -3307,13 +3452,14 @@ static struct task_struct *pick_task_scx(struct rq *rq)
}
if (unlikely(!p->scx.slice)) {
- if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ struct scx_sched *sch = scx_root;
+
+ if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
- scx_warned_zero_slice = true;
+ sch->warned_zero_slice = true;
}
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
}
}
@@ -3342,13 +3488,17 @@ static struct task_struct *pick_task_scx(struct rq *rq)
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi)
{
+ struct scx_sched *sch = scx_root;
+
/*
* The const qualifiers are dropped from task_struct pointers when
* calling ops.core_sched_before(). Accesses are controlled by the
* verifier.
*/
- if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+ if (SCX_HAS_OP(sch, core_sched_before) &&
+ !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+ NULL,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3356,10 +3506,9 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
}
#endif /* CONFIG_SCHED_CORE */
-#ifdef CONFIG_SMP
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ struct scx_sched *sch = scx_root;
bool rq_bypass;
/*
@@ -3376,7 +3525,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
return prev_cpu;
rq_bypass = scx_rq_bypassing(task_rq(p));
- if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3384,29 +3533,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, p, prev_cpu, wake_flags);
+ cpu = SCX_CALL_OP_TASK_RET(sch,
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, NULL, p, prev_cpu,
+ wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
- if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- p->scx.slice = SCX_SLICE_DFL;
+ refill_task_slice_dfl(p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
} else {
cpu = prev_cpu;
}
p->scx.selected_cpu = cpu;
if (rq_bypass)
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3419,6 +3569,8 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_scx(struct task_struct *p,
struct affinity_context *ac)
{
+ struct scx_sched *sch = scx_root;
+
set_cpus_allowed_common(p, ac);
/*
@@ -3429,28 +3581,38 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* Fine-grained memory write control is enforced by BPF making the const
* designation pointless. Cast it away when calling the operation.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
atomic_long_inc(&scx_hotplug_seq);
+ /*
+ * scx_root updates are protected by cpus_read_lock() and will stay
+	 * stable here. Note that we can't depend on the scx_enabled() test as the
+ * hotplug ops need to be enabled before __scx_enabled is set.
+ */
+ if (unlikely(!sch))
+ return;
+
if (scx_enabled())
- scx_idle_update_selcpu_topology(&scx_ops);
+ scx_idle_update_selcpu_topology(&sch->ops);
- if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
- else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+ if (online && SCX_HAS_OP(sch, cpu_online))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+ else if (!online && SCX_HAS_OP(sch, cpu_offline))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
else
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "cpu %d going %s, exiting scheduler", cpu,
- online ? "online" : "offline");
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
}
void scx_rq_activate(struct rq *rq)
@@ -3473,15 +3635,19 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
{
+ struct scx_sched *sch;
struct task_struct *p;
struct rq_flags rf;
bool timed_out = false;
rq_lock_irqsave(rq, &rf);
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
unsigned long last_runnable = p->scx.runnable_at;
@@ -3489,16 +3655,15 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + scx_watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid,
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
timed_out = true;
break;
}
}
+out_unlock:
rq_unlock_irqrestore(rq, &rf);
-
return timed_out;
}
@@ -3520,19 +3685,24 @@ static void scx_watchdog_workfn(struct work_struct *work)
void scx_tick(struct rq *rq)
{
+ struct scx_sched *sch;
unsigned long last_check;
if (!scx_enabled())
return;
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ return;
+
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
last_check + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "watchdog failed to check in for %u.%03us",
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
}
update_other_load_avgs(rq);
@@ -3540,6 +3710,8 @@ void scx_tick(struct rq *rq)
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
+ struct scx_sched *sch = scx_root;
+
update_curr_scx(rq);
/*
@@ -3549,8 +3721,8 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
- } else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
+ } else if (SCX_HAS_OP(sch, tick)) {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -3615,21 +3787,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
}
-static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
{
+ struct scx_sched *sch = scx_root;
int ret;
p->scx.disallow = false;
- if (SCX_HAS_OP(init_task)) {
+ if (SCX_HAS_OP(sch, init_task)) {
struct scx_init_task_args args = {
SCX_INIT_TASK_ARGS_CGROUP(tg)
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
+ p, &args);
if (unlikely(ret)) {
- ret = ops_sanitize_err("init_task", ret);
+ ret = ops_sanitize_err(sch, "init_task", ret);
return ret;
}
}
@@ -3657,8 +3831,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
task_rq_unlock(rq, p, &rf);
} else if (p->policy == SCHED_EXT) {
- scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
- p->comm, p->pid);
+ scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
}
}
@@ -3666,11 +3840,13 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
return 0;
}
-static void scx_ops_enable_task(struct task_struct *p)
+static void scx_enable_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
+ struct rq *rq = task_rq(p);
u32 weight;
- lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_rq_held(rq);
/*
* Set the weight before calling ops.enable() so that the scheduler
@@ -3683,26 +3859,31 @@ static void scx_ops_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
- if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+ if (SCX_HAS_OP(sch, enable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
-static void scx_ops_disable_task(struct task_struct *p)
+static void scx_disable_task(struct task_struct *p)
{
- lockdep_assert_rq_held(task_rq(p));
+ struct scx_sched *sch = scx_root;
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
- if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
+ if (SCX_HAS_OP(sch, disable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
-static void scx_ops_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
.cancelled = false,
};
@@ -3718,15 +3899,16 @@ static void scx_ops_exit_task(struct task_struct *p)
case SCX_TASK_READY:
break;
case SCX_TASK_ENABLED:
- scx_ops_disable_task(p);
+ scx_disable_task(p);
break;
default:
WARN_ON_ONCE(true);
return;
}
- if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3758,15 +3940,15 @@ int scx_fork(struct task_struct *p)
{
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_ops_init_task_enabled)
- return scx_ops_init_task(p, task_group(p), true);
+ if (scx_init_task_enabled)
+ return scx_init_task(p, task_group(p), true);
else
return 0;
}
void scx_post_fork(struct task_struct *p)
{
- if (scx_ops_init_task_enabled) {
+ if (scx_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/*
@@ -3779,7 +3961,7 @@ void scx_post_fork(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_enable_task(p);
+ scx_enable_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3799,7 +3981,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
@@ -3815,15 +3997,15 @@ void sched_ext_free(struct task_struct *p)
spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
- * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
- * ENABLED transitions can't race us. Disable ops for @p.
+ * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
+ * transitions can't race us. Disable ops for @p.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3831,11 +4013,14 @@ void sched_ext_free(struct task_struct *p)
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
const struct load_weight *lw)
{
+ struct scx_sched *sch = scx_root;
+
lockdep_assert_rq_held(task_rq(p));
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3844,20 +4029,22 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_enable_task(p);
+ struct scx_sched *sch = scx_root;
+
+ scx_enable_task(p);
/*
* set_cpus_allowed_scx() is not called while @p is associated with a
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
- (struct cpumask *)p->cpus_ptr);
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
+ p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_disable_task(p);
+ scx_disable_task(p);
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
@@ -3899,60 +4086,40 @@ bool scx_can_stop_tick(struct rq *rq)
DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
static bool scx_cgroup_enabled;
-static bool cgroup_warned_missing_weight;
-static bool cgroup_warned_missing_idle;
-
-static void scx_cgroup_warn_missing_weight(struct task_group *tg)
-{
- if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
- cgroup_warned_missing_weight)
- return;
-
- if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent)
- return;
-
- pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n",
- scx_ops.name);
- cgroup_warned_missing_weight = true;
-}
-static void scx_cgroup_warn_missing_idle(struct task_group *tg)
+void scx_tg_init(struct task_group *tg)
{
- if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
- return;
-
- if (!tg->idle)
- return;
-
- pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n",
- scx_ops.name);
- cgroup_warned_missing_idle = true;
+ tg->scx.weight = CGROUP_WEIGHT_DFL;
+ tg->scx.bw_period_us = default_bw_period_us();
+ tg->scx.bw_quota_us = RUNTIME_INF;
}
int scx_tg_online(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
int ret = 0;
- WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+ WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
percpu_down_read(&scx_cgroup_rwsem);
- scx_cgroup_warn_missing_weight(tg);
-
if (scx_cgroup_enabled) {
- if (SCX_HAS_OP(cgroup_init)) {
+ if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
- { .weight = tg->scx_weight };
+ { .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
- tg->css.cgroup, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+ NULL, tg->css.cgroup, &args);
if (ret)
- ret = ops_sanitize_err("cgroup_init", ret);
+ ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
- tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
} else {
- tg->scx_flags |= SCX_TG_ONLINE;
+ tg->scx.flags |= SCX_TG_ONLINE;
}
percpu_up_read(&scx_cgroup_rwsem);
@@ -3961,19 +4128,24 @@ int scx_tg_online(struct task_group *tg)
void scx_tg_offline(struct task_group *tg)
{
- WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+ struct scx_sched *sch = scx_root;
+
+ WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
- if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
- tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
+ (tg->scx.flags & SCX_TG_INITED))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ tg->css.cgroup);
+ tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
}
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
int ret;
@@ -3998,8 +4170,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
if (from == to)
continue;
- if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
+ if (SCX_HAS_OP(sch, cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
+ cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -4012,18 +4185,21 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
percpu_up_read(&scx_cgroup_rwsem);
- return ops_sanitize_err("cgroup_prep_move", ret);
+ return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
void scx_cgroup_move_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
+
if (!scx_cgroup_enabled)
return;
@@ -4031,9 +4207,11 @@ void scx_cgroup_move_task(struct task_struct *p)
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
- if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
- p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ if (SCX_HAS_OP(sch, cgroup_move) &&
+ !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from,
+ tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4044,6 +4222,7 @@ void scx_cgroup_finish_attach(void)
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
@@ -4051,9 +4230,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
- p->scx.cgrp_moving_from, css->cgroup);
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
out_unlock:
@@ -4062,22 +4242,43 @@ out_unlock:
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
+ struct scx_sched *sch = scx_root;
+
percpu_down_read(&scx_cgroup_rwsem);
- if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
- tg_cgrp(tg), weight);
- tg->scx_weight = weight;
- }
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
+ tg->scx.weight != weight)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
+ tg_cgrp(tg), weight);
+
+ tg->scx.weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
void scx_group_set_idle(struct task_group *tg, bool idle)
{
+ /* TODO: Implement ops->cgroup_set_idle() */
+}
+
+void scx_group_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ struct scx_sched *sch = scx_root;
+
percpu_down_read(&scx_cgroup_rwsem);
- scx_cgroup_warn_missing_idle(tg);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+ (tg->scx.bw_period_us != period_us ||
+ tg->scx.bw_quota_us != quota_us ||
+ tg->scx.bw_burst_us != burst_us))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+ tg_cgrp(tg), period_us, quota_us, burst_us);
+
+ tg->scx.bw_period_us = period_us;
+ tg->scx.bw_quota_us = quota_us;
+ tg->scx.bw_burst_us = burst_us;
+
percpu_up_read(&scx_cgroup_rwsem);
}
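For illustration, a minimal BPF-side sketch of the receiving end of the forwarding done above (the "bwsched" prefix and the bpf_printk() body are assumptions; only the ops.cgroup_set_bandwidth() signature comes from this series):
	#include <scx/common.bpf.h>
	/*
	 * Hypothetical receiver for the parameters forwarded by
	 * scx_group_set_bandwidth(). The kernel only reports the values; any
	 * throttling policy built on top of them is the scheduler's business.
	 */
	void BPF_STRUCT_OPS(bwsched_cgroup_set_bandwidth, struct cgroup *cgrp,
			    u64 period_us, u64 quota_us, u64 burst_us)
	{
		/* burst_us ignored in this sketch */
		bpf_printk("cgrp=%llu quota=%llu period=%llu",
			   cgrp->kn->id, quota_us, period_us);
	}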
@@ -4124,14 +4325,12 @@ DEFINE_SCHED_CLASS(ext) = {
.put_prev_task = put_prev_task_scx,
.set_next_task = set_next_task_scx,
-#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_scx,
.task_woken = task_woken_scx,
.set_cpus_allowed = set_cpus_allowed_scx,
.rq_online = rq_online_scx,
.rq_offline = rq_offline_scx,
-#endif
.task_tick = task_tick_scx,
@@ -4157,29 +4356,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
dsq->id = dsq_id;
}
-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
-{
- struct scx_dispatch_q *dsq;
- int ret;
-
- if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
- return ERR_PTR(-EINVAL);
-
- dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq)
- return ERR_PTR(-ENOMEM);
-
- init_dsq(dsq, dsq_id);
-
- ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
- if (ret) {
- kfree(dsq);
- return ERR_PTR(ret);
- }
- return dsq;
-}
-
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
struct llist_node *to_free = llist_del_all(&dsqs_to_free);
@@ -4191,26 +4367,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
-static void destroy_dsq(u64 dsq_id)
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
{
struct scx_dispatch_q *dsq;
unsigned long flags;
rcu_read_lock();
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (!dsq)
goto out_unlock_rcu;
raw_spin_lock_irqsave(&dsq->lock, flags);
if (dsq->nr) {
- scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
- dsq->id, dsq->nr);
+ scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
goto out_unlock_dsq;
}
- if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params))
goto out_unlock_dsq;
/*
@@ -4230,7 +4407,7 @@ out_unlock_rcu:
}
#ifdef CONFIG_EXT_GROUP_SCHED
-static void scx_cgroup_exit(void)
+static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
@@ -4246,18 +4423,19 @@ static void scx_cgroup_exit(void)
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- if (!(tg->scx_flags & SCX_TG_INITED))
+ if (!(tg->scx.flags & SCX_TG_INITED))
continue;
- tg->scx_flags &= ~SCX_TG_INITED;
+ tg->scx.flags &= ~SCX_TG_INITED;
- if (!scx_ops.cgroup_exit)
+ if (!sch->ops.cgroup_exit)
continue;
if (WARN_ON_ONCE(!css_tryget(css)))
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4265,16 +4443,13 @@ static void scx_cgroup_exit(void)
rcu_read_unlock();
}
-static int scx_cgroup_init(void)
+static int scx_cgroup_init(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
int ret;
percpu_rwsem_assert_held(&scx_cgroup_rwsem);
- cgroup_warned_missing_weight = false;
- cgroup_warned_missing_idle = false;
-
/*
* scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and init, all online cgroups are initialized.
@@ -4282,17 +4457,19 @@ static int scx_cgroup_init(void)
rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
-
- scx_cgroup_warn_missing_weight(tg);
- scx_cgroup_warn_missing_idle(tg);
+ struct scx_cgroup_init_args args = {
+ .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us,
+ };
- if ((tg->scx_flags &
+ if ((tg->scx.flags &
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
- if (!scx_ops.cgroup_init) {
- tg->scx_flags |= SCX_TG_INITED;
+ if (!sch->ops.cgroup_init) {
+ tg->scx.flags |= SCX_TG_INITED;
continue;
}
@@ -4300,14 +4477,14 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
css_put(css);
- scx_ops_error("ops.cgroup_init() failed (%d)", ret);
+ scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
rcu_read_lock();
css_put(css);
@@ -4321,8 +4498,8 @@ static int scx_cgroup_init(void)
}
#else
-static void scx_cgroup_exit(void) {}
-static int scx_cgroup_init(void) { return 0; }
+static void scx_cgroup_exit(struct scx_sched *sch) {}
+static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
#endif
@@ -4339,8 +4516,7 @@ static int scx_cgroup_init(void) { return 0; }
static ssize_t scx_attr_state_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- scx_ops_enable_state_str[scx_ops_enable_state()]);
+ return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
}
SCX_ATTR(state);
@@ -4385,15 +4561,51 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int node;
+
+ kthread_stop(sch->helper->task);
+ free_percpu(sch->event_stats_cpu);
+
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+
+ rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(sch, dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
static void scx_kobj_release(struct kobject *kobj)
{
- kfree(kobj);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_ops.name);
+ return sysfs_emit(buf, "%s\n", scx_root->ops.name);
}
SCX_ATTR(ops);
@@ -4404,16 +4616,17 @@ SCX_ATTR(ops);
static ssize_t scx_attr_events_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
struct scx_event_stats events;
int at = 0;
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -4436,7 +4649,7 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+ return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4449,14 +4662,55 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() ||
- unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+ if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
return policy == SCHED_EXT;
}
+bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
+ p->sched_class != &ext_sched_class;
+}
+
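scx_allow_ttwu_queue() keys off SCX_OPS_ALLOW_QUEUED_WAKEUP, which a BPF scheduler opts into when defining its ops struct. A hedged sketch (the "qwake" callbacks are assumed to be defined elsewhere):
	SCX_OPS_DEFINE(qwake_ops,
		       .enqueue		= (void *)qwake_enqueue,
		       .dispatch	= (void *)qwake_dispatch,
		       .flags		= SCX_OPS_ALLOW_QUEUED_WAKEUP,
		       .name		= "qwake");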
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch)) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
+ break;
+ default:
+ rcu_read_unlock();
+ return false;
+ }
+
+ scx_error(sch, "RCU CPU stall detected!");
+ rcu_read_unlock();
+
+ return true;
+}
+
/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
@@ -4469,40 +4723,48 @@ bool task_should_scx(int policy)
*/
void scx_softlockup(u32 dur_s)
{
- switch (scx_ops_enable_state()) {
- case SCX_OPS_ENABLING:
- case SCX_OPS_ENABLED:
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
break;
default:
- return;
+ goto out_unlock;
}
- /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ /* allow only one instance, cleared at the end of scx_bypass() */
if (test_and_set_bit(0, &scx_in_softlockup))
- return;
+ goto out_unlock;
printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_ops.name);
+ smp_processor_id(), dur_s, scx_root->ops.name);
/*
* Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might not even be able to get to
- * scx_ops_bypass().
+ * immediately; otherwise, we might not even be able to get to scx_bypass().
*/
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
- scx_ops_error("soft lockup - CPU#%d stuck for %us",
- smp_processor_id(), dur_s);
+ scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
+out_unlock:
+ rcu_read_unlock();
}
static void scx_clear_softlockup(void)
{
if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
}
/**
- * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4532,32 +4794,36 @@ static void scx_clear_softlockup(void)
*
* - scx_prio_less() reverts to the default core_sched_at order.
*/
-static void scx_ops_bypass(bool bypass)
+static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
-
- int cpu;
+ struct scx_sched *sch;
unsigned long flags;
+ int cpu;
raw_spin_lock_irqsave(&bypass_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+
if (bypass) {
- scx_ops_bypass_depth++;
- WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
- if (scx_ops_bypass_depth != 1)
+ scx_bypass_depth++;
+ WARN_ON_ONCE(scx_bypass_depth <= 0);
+ if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
- scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
} else {
- scx_ops_bypass_depth--;
- WARN_ON_ONCE(scx_ops_bypass_depth < 0);
- if (scx_ops_bypass_depth != 0)
+ scx_bypass_depth--;
+ WARN_ON_ONCE(scx_bypass_depth < 0);
+ if (scx_bypass_depth != 0)
goto unlock;
- scx_add_event(SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - bypass_timestamp);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
/*
* No task property is changing. We just need to make sure all currently
@@ -4615,7 +4881,7 @@ static void scx_ops_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
@@ -4623,7 +4889,7 @@ unlock:
static void free_exit_info(struct scx_exit_info *ei)
{
- kfree(ei->dump);
+ kvfree(ei->dump);
kfree(ei->msg);
kfree(ei->bt);
kfree(ei);
@@ -4639,7 +4905,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
- ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
+ ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL);
if (!ei->bt || !ei->msg || !ei->dump) {
free_exit_info(ei);
@@ -4671,42 +4937,36 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void scx_ops_disable_workfn(struct kthread_work *work)
+static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+ struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
- struct rhashtable_iter rht_iter;
- struct scx_dispatch_q *dsq;
- int i, kind, cpu;
+ int kind, cpu;
- kind = atomic_read(&scx_exit_kind);
+ kind = atomic_read(&sch->exit_kind);
while (true) {
- /*
- * NONE indicates that a new scx_ops has been registered since
- * disable was scheduled - don't kill the new ops. DONE
- * indicates that the ops has already been disabled.
- */
- if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
return;
- if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
/* guarantee forward progress by bypassing scx_ops */
- scx_ops_bypass(true);
+ scx_bypass(true);
- switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
- case SCX_OPS_DISABLING:
+ switch (scx_set_enable_state(SCX_DISABLING)) {
+ case SCX_DISABLING:
WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
break;
- case SCX_OPS_DISABLED:
+ case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_exit_info->msg);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ sch->exit_info->msg);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
break;
@@ -4717,17 +4977,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
* we can safely use blocking synchronization constructs. Actually
* disable ops.
*/
- mutex_lock(&scx_ops_enable_mutex);
+ mutex_lock(&scx_enable_mutex);
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
/*
* Shut down cgroup support before tasks so that the cgroup attach path
- * doesn't race against scx_ops_exit_task().
+ * doesn't race against scx_exit_task().
*/
scx_cgroup_lock();
- scx_cgroup_exit();
+ scx_cgroup_exit(sch);
scx_cgroup_unlock();
/*
@@ -4736,7 +4996,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*/
percpu_down_write(&scx_fork_rwsem);
- scx_ops_init_task_enabled = false;
+ scx_init_task_enabled = false;
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
@@ -4756,7 +5016,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4771,98 +5031,71 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
- static_branch_disable(&__scx_ops_enabled);
- for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
- static_branch_disable(&scx_has_op[i]);
- static_branch_disable(&scx_ops_allow_queued_wakeup);
- static_branch_disable(&scx_ops_enq_last);
- static_branch_disable(&scx_ops_enq_exiting);
- static_branch_disable(&scx_ops_enq_migration_disabled);
- static_branch_disable(&scx_ops_cpu_preempt);
+ static_branch_disable(&__scx_enabled);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
}
- if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/*
- * Delete the kobject from the hierarchy eagerly in addition to just
- * dropping a reference. Otherwise, if the object is deleted
- * asynchronously, sysfs could observe an object of the same name still
- * in the hierarchy when another scheduler is loaded.
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
*/
- kobject_del(scx_root_kobj);
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
-
- memset(&scx_ops, 0, sizeof(scx_ops));
-
- rhashtable_walk_enter(&dsq_hash, &rht_iter);
- do {
- rhashtable_walk_start(&rht_iter);
-
- while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
- destroy_dsq(dsq->id);
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
- rhashtable_walk_stop(&rht_iter);
- } while (dsq == ERR_PTR(-EAGAIN));
- rhashtable_walk_exit(&rht_iter);
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
+ mutex_unlock(&scx_enable_mutex);
- mutex_unlock(&scx_ops_enable_mutex);
-
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
- scx_ops_bypass(false);
-}
-
-static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
-
-static void schedule_scx_ops_disable_work(void)
-{
- struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
-
- /*
- * We may be called spuriously before the first bpf_sched_ext_reg(). If
- * scx_ops_helper isn't set up yet, there's nothing to do.
- */
- if (helper)
- kthread_queue_work(helper, &scx_ops_disable_work);
+ scx_bypass(false);
}
-static void scx_ops_disable(enum scx_exit_kind kind)
+static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
-
- schedule_scx_ops_disable_work();
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ kthread_queue_work(sch->helper, &sch->disable_work);
+ }
+ rcu_read_unlock();
}
static void dump_newline(struct seq_buf *s)
@@ -4980,6 +5213,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
+ struct scx_sched *sch = scx_root;
char dsq_id_buf[19] = "(n/a)";
unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
unsigned int bt_len = 0;
@@ -5002,9 +5236,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
- if (SCX_HAS_OP(dump_task)) {
+ if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -5021,6 +5255,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
{
static DEFINE_SPINLOCK(dump_lock);
static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_sched *sch = scx_root;
struct scx_dump_ctx dctx = {
.kind = ei->kind,
.exit_code = ei->exit_code,
@@ -5049,9 +5284,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
}
- if (SCX_HAS_OP(dump)) {
+ if (SCX_HAS_OP(sch, dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -5072,7 +5307,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
- if (idle && !SCX_HAS_OP(dump_cpu))
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
goto next;
/*
@@ -5106,9 +5341,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
cpumask_pr_args(rq->scx.cpus_to_wait));
used = seq_buf_used(&ns);
- if (SCX_HAS_OP(dump_cpu)) {
+ if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+ &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5142,13 +5378,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_line(&s, "Event counters");
dump_line(&s, "--------------");
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -5160,27 +5396,25 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
spin_unlock_irqrestore(&dump_lock, flags);
}
-static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+static void scx_error_irq_workfn(struct irq_work *irq_work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_exit_info *ei = sch->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, scx_ops.exit_dump_len);
+ scx_dump_state(ei, sch->ops.exit_dump_len);
- schedule_scx_ops_disable_work();
+ kthread_queue_work(sch->helper, &sch->disable_work);
}
-static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
-
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...)
+static void scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, va_list args)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = sch->exit_info;
int none = SCX_EXIT_NONE;
- va_list args;
- if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return;
ei->exit_code = exit_code;
@@ -5188,31 +5422,98 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
if (kind >= SCX_EXIT_ERROR)
ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
- va_start(args, fmt);
vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
- va_end(args);
/*
* Set ei->kind and ->reason for scx_dump_state(). They'll be set again
- * in scx_ops_disable_workfn().
+ * in scx_disable_workfn().
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&scx_ops_error_irq_work);
+ irq_work_queue(&sch->error_irq_work);
}
-static struct kthread_worker *scx_create_rt_helper(const char *name)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
- struct kthread_worker *helper;
+ struct scx_sched *sch;
+ int node, ret;
- helper = kthread_run_worker(0, name);
- if (helper)
- sched_set_fifo(helper->task);
- return helper;
-}
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch)
+ return ERR_PTR(-ENOMEM);
+
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
+ ret = -ENOMEM;
+ goto err_free_sch;
+ }
+
+ ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+ if (ret < 0)
+ goto err_free_ei;
+
+ sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+ GFP_KERNEL);
+ if (!sch->global_dsqs) {
+ ret = -ENOMEM;
+ goto err_free_hash;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ sch->global_dsqs[node] = dsq;
+ }
+
+ sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+ if (!sch->event_stats_cpu) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ sch->helper = kthread_run_worker(0, "sched_ext_helper");
+ if (!sch->helper) {
+ ret = -ENOMEM;
+ goto err_free_event_stats;
+ }
+ sched_set_fifo(sch->helper->task);
-static void check_hotplug_seq(const struct sched_ext_ops *ops)
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ kthread_init_work(&sch->disable_work, scx_disable_workfn);
+ sch->ops = *ops;
+ ops->priv = sch;
+
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_stop_helper;
+
+ return sch;
+
+err_stop_helper:
+ kthread_stop(sch->helper->task);
+err_free_event_stats:
+ free_percpu(sch->event_stats_cpu);
+err_free_gdsqs:
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+err_free_hash:
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+err_free_ei:
+ free_exit_info(sch->exit_info);
+err_free_sch:
+ kfree(sch);
+ return ERR_PTR(ret);
+}
+
+static void check_hotplug_seq(struct scx_sched *sch,
+ const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -5224,21 +5525,22 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops)
if (ops->hotplug_seq) {
global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
if (ops->hotplug_seq != global_hotplug_seq) {
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "expected hotplug seq %llu did not match actual %llu",
- ops->hotplug_seq, global_hotplug_seq);
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
}
}
}
-static int validate_ops(const struct sched_ext_ops *ops)
+static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
/*
* It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
* ops.enqueue() callback isn't implemented.
*/
if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
- scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
return -EINVAL;
}
@@ -5248,19 +5550,23 @@ static int validate_ops(const struct sched_ext_ops *ops)
*/
if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
(ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
return -EINVAL;
}
+ if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
+ pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
+
return 0;
}
-static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
+ struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, cpu, node, ret;
+ int i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
@@ -5268,87 +5574,25 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return -EINVAL;
}
- mutex_lock(&scx_ops_enable_mutex);
-
- /*
- * Clear event counters so a new scx scheduler gets
- * fresh event counter values.
- */
- for_each_possible_cpu(cpu) {
- struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
- memset(e, 0, sizeof(*e));
- }
-
- if (!scx_ops_helper) {
- WRITE_ONCE(scx_ops_helper,
- scx_create_rt_helper("sched_ext_ops_helper"));
- if (!scx_ops_helper) {
- ret = -ENOMEM;
- goto err_unlock;
- }
- }
+ mutex_lock(&scx_enable_mutex);
- if (!global_dsqs) {
- struct scx_dispatch_q **dsqs;
-
- dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
- if (!dsqs) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- for_each_node_state(node, N_POSSIBLE) {
- struct scx_dispatch_q *dsq;
-
- dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq) {
- for_each_node_state(node, N_POSSIBLE)
- kfree(dsqs[node]);
- kfree(dsqs);
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- init_dsq(dsq, SCX_DSQ_GLOBAL);
- dsqs[node] = dsq;
- }
-
- global_dsqs = dsqs;
- }
-
- if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
}
- scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
- if (!scx_root_kobj) {
- ret = -ENOMEM;
+ sch = scx_alloc_and_add_sched(ops);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
goto err_unlock;
}
- scx_root_kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err;
-
- scx_exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_exit_info) {
- ret = -ENOMEM;
- goto err_del;
- }
-
/*
- * Set scx_ops, transition to ENABLING and clear exit info to arm the
- * disable path. Failure triggers full disabling from here on.
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
*/
- scx_ops = *ops;
-
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
- SCX_OPS_DISABLED);
-
- atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
- scx_warned_zero_slice = false;
+ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
+ WARN_ON_ONCE(scx_root);
atomic_long_set(&scx_nr_rejected, 0);
@@ -5361,26 +5605,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
- if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
+ scx_idle_enable(ops);
+
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
if (ret) {
- ret = ops_sanitize_err("init", ret);
+ ret = ops_sanitize_err(sch, "init", ret);
cpus_read_unlock();
- scx_ops_error("ops.init() failed (%d)", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable_cpuslocked(&scx_has_op[i]);
+ set_bit(i, sch->has_op);
- check_hotplug_seq(ops);
+ check_hotplug_seq(sch, ops);
scx_idle_update_selcpu_topology(ops);
cpus_read_unlock();
- ret = validate_ops(ops);
+ ret = validate_ops(sch, ops);
if (ret)
goto err_disable;
@@ -5405,29 +5657,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_watchdog_timeout / 2);
/*
- * Once __scx_ops_enabled is set, %current can be switched to SCX
- * anytime. This can lead to stalls as some BPF schedulers (e.g.
- * userspace scheduling) may not function correctly before all tasks are
- * switched. Init in bypass mode to guarantee forward progress.
+ * Once __scx_enabled is set, %current can be switched to SCX anytime.
+ * This can lead to stalls as some BPF schedulers (e.g. userspace
+ * scheduling) may not function correctly before all tasks are switched.
+ * Init in bypass mode to guarantee forward progress.
*/
- scx_ops_bypass(true);
+ scx_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable(&scx_has_op[i]);
-
- if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
- static_branch_enable(&scx_ops_allow_queued_wakeup);
- if (ops->flags & SCX_OPS_ENQ_LAST)
- static_branch_enable(&scx_ops_enq_last);
- if (ops->flags & SCX_OPS_ENQ_EXITING)
- static_branch_enable(&scx_ops_enq_exiting);
- if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
- static_branch_enable(&scx_ops_enq_migration_disabled);
- if (scx_ops.cpu_acquire || scx_ops.cpu_release)
- static_branch_enable(&scx_ops_cpu_preempt);
+ set_bit(i, sch->has_op);
- scx_idle_enable(ops);
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5435,8 +5677,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
percpu_down_write(&scx_fork_rwsem);
- WARN_ON_ONCE(scx_ops_init_task_enabled);
- scx_ops_init_task_enabled = true;
+ WARN_ON_ONCE(scx_init_task_enabled);
+ scx_init_task_enabled = true;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -5445,14 +5687,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* tasks. Prep all tasks first and then enable them with preemption
* disabled.
*
- * All cgroups should be initialized before scx_ops_init_task() so that
- * the BPF scheduler can reliably track each task's cgroup membership
- * from scx_ops_init_task(). Lock out cgroup on/offlining and task
- * migrations while tasks are being initialized so that
- * scx_cgroup_can_attach() never sees uninitialized tasks.
+ * All cgroups should be initialized before scx_init_task() so that the
+ * BPF scheduler can reliably track each task's cgroup membership from
+ * scx_init_task(). Lock out cgroup on/offlining and task migrations
+ * while tasks are being initialized so that scx_cgroup_can_attach()
+ * never sees uninitialized tasks.
*/
scx_cgroup_lock();
- ret = scx_cgroup_init();
+ ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
@@ -5468,13 +5710,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_unlock(&sti);
- ret = scx_ops_init_task(p, task_group(p), false);
+ ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
- scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
- ret, p->comm, p->pid);
+ scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
goto err_disable_unlock_all;
}
@@ -5492,7 +5734,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* all eligible tasks.
*/
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
- static_branch_enable(&__scx_ops_enabled);
+ static_branch_enable(&__scx_enabled);
/*
* We're fully committed and can't fail. The task READY -> ENABLED
@@ -5523,10 +5765,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
- if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5534,44 +5776,35 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root_kobj, KOBJ_ADD);
- mutex_unlock(&scx_ops_enable_mutex);
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
return 0;
-err_del:
- kobject_del(scx_root_kobj);
-err:
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
- if (scx_exit_info) {
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
- }
err_unlock:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
return ret;
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
err_disable:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
/*
* Returning an error code here would not pass all the error information
- * to userspace. Record errno using scx_ops_error() for cases
- * scx_ops_error() wasn't already invoked and exit indicating success so
- * that the error is notified through ops.exit() with all the details.
+ * to userspace. Record errno using scx_error() for cases scx_error()
+ * wasn't already invoked and exit indicating success so that the error
+ * is notified through ops.exit() with all the details.
*
- * Flush scx_ops_disable_work to ensure that error is reported before
- * init completion.
+ * Flush scx_disable_work to ensure that error is reported before init
+ * completion. sch's base reference will be put by bpf_scx_unreg().
*/
- scx_ops_error("scx_ops_enable() failed (%d)", ret);
- kthread_flush_work(&scx_ops_disable_work);
+ scx_error(sch, "scx_enable() failed (%d)", ret);
+ kthread_flush_work(&sch->disable_work);
return 0;
}
@@ -5622,21 +5855,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
return -EACCES;
}
-static const struct bpf_func_proto *
-bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_task_storage_get:
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- return &bpf_task_storage_delete_proto;
- default:
- return bpf_base_func_proto(func_id, prog);
- }
-}
-
static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
- .get_func_proto = bpf_scx_get_func_proto,
+ .get_func_proto = bpf_base_func_proto,
.is_valid_access = bpf_scx_is_valid_access,
.btf_struct_access = bpf_scx_btf_struct_access,
};
@@ -5715,13 +5935,17 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_ops_enable(kdata, link);
+ return scx_enable(kdata, link);
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- scx_ops_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&scx_ops_disable_work);
+ struct sched_ext_ops *ops = kdata;
+ struct scx_sched *sch = ops->priv;
+
+ scx_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
}
static int bpf_scx_init(struct btf *btf)
@@ -5775,6 +5999,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5812,6 +6037,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_move = sched_ext_ops__cgroup_move,
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
#endif
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
@@ -5843,10 +6069,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- if (scx_ops_helper)
- scx_ops_disable(SCX_EXIT_SYSRQ);
- else
- pr_info("sched_ext: BPF scheduler not yet used\n");
+ scx_disable(SCX_EXIT_SYSRQ);
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -5994,13 +6217,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
*/
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
- enum scx_ops_enable_state state = scx_ops_enable_state();
+ struct scx_sched *sch = scx_root;
+ enum scx_enable_state state = scx_enable_state();
const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
char runnable_at_buf[22] = "?";
struct sched_class *class;
unsigned long runnable_at;
- if (state == SCX_OPS_DISABLED)
+ if (state == SCX_DISABLED)
return;
/*
@@ -6009,8 +6233,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
*/
if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
class != &ext_sched_class) {
- printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
- scx_ops_enable_state_str[state], all);
+ printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
+ scx_enable_state_str[state], all);
return;
}
@@ -6021,7 +6245,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
- log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ log_lvl, sch->ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
@@ -6037,12 +6261,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
case PM_RESTORE_PREPARE:
- scx_ops_bypass(true);
+ scx_bypass(true);
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
case PM_POST_RESTORE:
- scx_ops_bypass(false);
+ scx_bypass(false);
break;
}
@@ -6065,7 +6289,6 @@ void __init init_sched_ext_class(void)
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
- BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
scx_idle_init_masks();
scx_kick_cpus_pnt_seqs =
@@ -6109,12 +6332,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_ops_error("called with NULL task");
+ scx_kf_error("called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
return false;
}
@@ -6134,7 +6357,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_ops_error("dispatch buffer overflow");
+ scx_kf_error("dispatch buffer overflow");
return;
}
@@ -6172,7 +6395,8 @@ __bpf_kfunc_start_defs();
* When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
* and this function can be called up to ops.dispatch_max_batch times to insert
* multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
- * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
+ * counter.
*
* This function doesn't have any locking restrictions and may be called under
* BPF locks (in the future when BPF introduces more flexible locking).
@@ -6196,14 +6420,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
scx_dsq_insert_commit(p, dsq_id, enq_flags);
}
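As the kernel-doc above notes, scx_bpf_dsq_insert() batches insertions from ops.dispatch() and scx_bpf_dsq_move_to_local() flushes the batch. A minimal usage sketch, loosely modeled on the scx_simple example scheduler (DSQ id and callback names are illustrative):
	#include <scx/common.bpf.h>
	#define SHARED_DSQ 0	/* arbitrary user-defined DSQ id, created in ops.init() */
	/* queue every runnable task on one shared FIFO DSQ */
	void BPF_STRUCT_OPS(minsched_enqueue, struct task_struct *p, u64 enq_flags)
	{
		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
	}
	/* when a CPU runs out of local work, pull the next task from the shared DSQ */
	void BPF_STRUCT_OPS(minsched_dispatch, s32 cpu, struct task_struct *prev)
	{
		scx_bpf_dsq_move_to_local(SHARED_DSQ);
	}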
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
- scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
-}
-
/**
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
* @p: task_struct to insert
@@ -6241,21 +6457,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
- scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -6266,6 +6472,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
@@ -6300,7 +6507,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
* cause similar live-lock conditions as consume_dispatch_q(). Insert a
* breather if necessary.
*/
- scx_ops_breather(src_rq);
+ scx_breather(src_rq);
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6318,7 +6525,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
}
/* @p is still on $src_dsq and stable, determine the destination */
- dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
/*
* Apply vtime and slice updates before moving so that the new time is
@@ -6331,7 +6538,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
p->scx.slice = kit->slice;
/* execute move */
- locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
+ locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
dispatched = true;
out:
if (in_balance) {
@@ -6379,7 +6586,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_ops_error("dispatch buffer underflow");
+ scx_kf_error("dispatch buffer underflow");
}
/**
@@ -6398,21 +6605,22 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
if (!scx_kf_allowed(SCX_KF_DISPATCH))
return false;
- flush_dispatch_buf(dspc->rq);
+ flush_dispatch_buf(sch, dspc->rq);
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
return false;
}
- if (consume_dispatch_q(dspc->rq, dsq)) {
+ if (consume_dispatch_q(sch, dspc->rq, dsq)) {
/*
* A successfully consumed task can be dequeued before it starts
* running while the CPU is trying to migrate other dispatched
@@ -6426,13 +6634,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
}
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
- return scx_bpf_dsq_move_to_local(dsq_id);
-}
-
/**
* scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6451,14 +6652,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
- struct bpf_iter_scx_dsq *it__iter, u64 slice)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
- scx_bpf_dsq_move_set_slice(it__iter, slice);
-}
-
/**
* scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6478,14 +6671,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
- struct bpf_iter_scx_dsq *it__iter, u64 vtime)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
- scx_bpf_dsq_move_set_vtime(it__iter, vtime);
-}
-
/**
* scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
* @it__iter: DSQ iterator in progress
@@ -6518,15 +6703,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
- return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
-}
-
/**
* scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
* @it__iter: DSQ iterator in progress
@@ -6552,30 +6728,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
- return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_consume)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -6666,10 +6828,36 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
+ struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
+ s32 ret;
+
if (unlikely(node >= (int)nr_node_ids ||
(node < 0 && node != NUMA_NO_NODE)))
return -EINVAL;
- return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+ return -EINVAL;
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return -ENOMEM;
+
+ init_dsq(dsq, dsq_id);
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ else
+ ret = -ENODEV;
+
+ rcu_read_unlock();
+ if (ret)
+ kfree(dsq);
+ return ret;
}
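With create_dsq() folded into the kfunc above, user DSQs are created directly from sleepable BPF context, typically ops.init(). A hedged sketch, reusing the SHARED_DSQ id from the earlier example (the callback name is illustrative):
	s32 BPF_STRUCT_OPS_SLEEPABLE(minsched_init)
	{
		/* -1 == NUMA_NO_NODE: no allocation-node preference for the DSQ */
		return scx_bpf_create_dsq(SHARED_DSQ, -1);
	}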
__bpf_kfunc_end_defs();
@@ -6680,10 +6868,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
@@ -6708,7 +6892,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *this_rq;
unsigned long irq_flags;
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6732,7 +6916,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6765,23 +6949,30 @@ out:
*/
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
+ struct scx_sched *sch;
struct scx_dispatch_q *dsq;
s32 ret;
preempt_disable();
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
if (dsq_id == SCX_DSQ_LOCAL) {
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (ops_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
goto out;
}
} else {
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (dsq) {
ret = READ_ONCE(dsq->nr);
goto out;
@@ -6804,7 +6995,13 @@ out:
*/
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
- destroy_dsq(dsq_id);
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ destroy_dsq(sch, dsq_id);
+ rcu_read_unlock();
}
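The matching teardown would typically come from ops.exit(); a hedged sketch (note that remaining user DSQs are also destroyed automatically by scx_sched_free_rcu_work() when the scheduler instance is freed):
	void BPF_STRUCT_OPS(minsched_exit, struct scx_exit_info *ei)
	{
		scx_bpf_destroy_dsq(SHARED_DSQ);
	}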
/**
@@ -6821,16 +7018,27 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
u64 flags)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ struct scx_sched *sch;
BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
sizeof(struct bpf_iter_scx_dsq));
BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) !=
__alignof__(struct bpf_iter_scx_dsq));
+ /*
+ * next() and destroy() will be called regardless of the return value.
+ * Always clear $kit->dsq.
+ */
+ kit->dsq = NULL;
+
+ sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
- kit->dsq = find_user_dsq(dsq_id);
+ kit->dsq = find_user_dsq(sch, dsq_id);
if (!kit->dsq)
return -ENOENT;
@@ -6920,21 +7128,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_ops_error("invalid data=%p and data__sz=%u",
- (void *)data, data__sz);
+ scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_ops_error("failed to read data fields (%d)", ret);
+ scx_kf_error("failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_ops_error("format preparation failed (%d)", ret);
+ scx_kf_error("format preparation failed (%d)", ret);
return ret;
}
@@ -6942,8 +7149,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_ops_error("(\"%s\", %p, %u) failed to format",
- fmt, data, data__sz);
+ scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
@@ -6976,8 +7182,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -6997,8 +7202,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7022,7 +7226,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
s32 ret;
if (raw_smp_processor_id() != dd->cpu) {
- scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
@@ -7063,7 +7267,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7085,7 +7289,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7108,18 +7312,37 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (ops_cpu_valid(cpu, NULL)) {
- struct rq *rq = cpu_rq(cpu);
+ if (kf_cpu_valid(cpu, NULL)) {
+ struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
+ struct rq_flags rf;
+
+ /*
+ * When called with an rq lock held, restrict the operation
+ * to the corresponding CPU to prevent ABBA deadlocks.
+ */
+ if (locked_rq && rq != locked_rq) {
+ scx_kf_error("Invalid target CPU %d", cpu);
+ return;
+ }
+
+ /*
+		 * If no rq lock is held, allow operating on any CPU by
+		 * acquiring the corresponding rq lock.
+ */
+ if (!locked_rq) {
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
+ }
rq->scx.cpuperf_target = perf;
+ cpufreq_update_util(rq, 0);
- rcu_read_lock_sched_notrace();
- cpufreq_update_util(cpu_rq(cpu), 0);
- rcu_read_unlock_sched_notrace();
+ if (!locked_rq)
+ rq_unlock_irqrestore(rq, &rf);
}
}
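
A minimal, hypothetical BPF-side sketch (not part of this patch) of driving the cpuperf knob from a callback that runs with the matching rq locked, which is the case the hunk above restricts to the current CPU. It assumes the usual scx BPF scheduler boilerplate (BPF_STRUCT_OPS and kfunc declarations); the op name is illustrative:

void BPF_STRUCT_OPS(example_running, struct task_struct *p)
{
	s32 cpu = scx_bpf_task_cpu(p);

	/* request ~50% of the maximum performance for the CPU @p runs on */
	scx_bpf_cpuperf_set(cpu, SCX_CPUPERF_ONE / 2);
}
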
@@ -7197,7 +7420,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NULL;
return cpu_rq(cpu);
@@ -7293,6 +7516,27 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
+{
+ struct scx_event_stats *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into @events. */
+ memset(events, 0, sizeof(*events));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+}
+
/*
 * scx_bpf_events - Get the aggregated system-wide event counters
* @events: output buffer from a BPF program
@@ -7301,23 +7545,16 @@ __bpf_kfunc u64 scx_bpf_now(void)
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
size_t events__sz)
{
- struct scx_event_stats e_sys, *e_cpu;
- int cpu;
+ struct scx_sched *sch;
+ struct scx_event_stats e_sys;
- /* Aggregate per-CPU event counters into the system-wide counters. */
- memset(&e_sys, 0, sizeof(e_sys));
- for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
- }
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ scx_read_events(sch, &e_sys);
+ else
+ memset(&e_sys, 0, sizeof(e_sys));
+ rcu_read_unlock();
/*
* We cannot entirely trust a BPF-provided size since a BPF program
@@ -7350,12 +7587,6 @@ BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
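
For reference, a hypothetical BPF-side sketch (not part of this patch) of consuming scx_bpf_events() shown above. It assumes the scx_event_stats fields are named after the SCX_EV_* constants, as the scx_agg_event() calls suggest, and that the counters are 64-bit; the helper name is illustrative:

static __always_inline s64 example_nr_bypass_activations(void)
{
	struct scx_event_stats events;

	/* copy the aggregated system-wide counters into @events */
	scx_bpf_events(&events, sizeof(events));

	return events.SCX_EV_BYPASS_ACTIVATE;
}
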
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1bda96b19a1b..292bb41a242e 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,8 +8,29 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -21,6 +42,7 @@ void scx_rq_activate(struct rq *rq);
void scx_rq_deactivate(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(int policy);
+bool scx_allow_ttwu_queue(const struct task_struct *p);
void init_sched_ext_class(void);
static inline u32 scx_cpuperf_target(s32 cpu)
@@ -36,13 +58,6 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
-static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
-{
- return !scx_enabled() ||
- static_branch_likely(&scx_ops_allow_queued_wakeup) ||
- p->sched_class != &ext_sched_class;
-}
-
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -66,7 +81,7 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
-#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+#ifdef CONFIG_SCHED_CLASS_EXT
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
@@ -80,6 +95,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#ifdef CONFIG_CGROUP_SCHED
#ifdef CONFIG_EXT_GROUP_SCHED
+void scx_tg_init(struct task_group *tg);
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
@@ -88,7 +104,9 @@ void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
#else /* CONFIG_EXT_GROUP_SCHED */
+static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
@@ -97,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 52c36a70a3d0..7174e1c1a392 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
/* Enable/disable per-node idle cpumasks */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
/* Enable/disable LLC aware optimizations */
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
@@ -47,6 +46,13 @@ static struct scx_idle_cpus scx_idle_global_masks;
static struct scx_idle_cpus **scx_idle_node_masks;
/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
+/*
* Return the idle masks associated to a target @node.
*
* NUMA_NO_NODE identifies the global idle cpumask.
@@ -68,7 +74,7 @@ static int scx_cpu_node_if_enabled(int cpu)
return cpu_to_node(cpu);
}
-bool scx_idle_test_and_clear_cpu(int cpu)
+static bool scx_idle_test_and_clear_cpu(int cpu)
{
int node = scx_cpu_node_if_enabled(cpu);
struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
@@ -131,6 +137,7 @@ found:
goto retry;
}
+#ifdef CONFIG_NUMA
/*
* Tracks nodes that have not yet been visited when searching for an idle
* CPU across all available nodes.
@@ -179,11 +186,18 @@ static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, i
return cpu;
}
+#else
+static inline s32
+pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif
/*
* Find an idle CPU in the system, starting from @node.
*/
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
{
s32 cpu;
@@ -235,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu)
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd)
- return 0;
+ return NULL;
return sched_domain_span(sd);
}
@@ -392,6 +406,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
}
/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+ return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
+/*
* Built-in CPU idle selection policy:
*
* 1. Prioritize full-idle cores:
@@ -403,13 +425,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
*
* Step 3 and 4 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -424,35 +448,69 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
*/
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
+ const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
+ const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
+ bool is_prev_allowed;
s32 cpu;
+ preempt_disable();
+
+ /*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+ }
+
/*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
/*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
+ * Determine the subset of CPUs that the task can use in its
+ * current LLC and node.
*
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+ const struct cpumask *cpus = numa_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ numa_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ numa_cpus = local_cpus;
+ }
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+ const struct cpumask *cpus = llc_span(prev_cpu);
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ llc_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ llc_cpus = local_cpus;
}
/*
@@ -466,7 +524,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* then avoid a migration.
*/
cpu = smp_processor_id();
- if (cpus_share_cache(cpu, prev_cpu) &&
+ if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -490,7 +548,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (cpumask_test_cpu(cpu, allowed))
goto out_unlock;
}
}
@@ -503,7 +561,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
/*
* Keep using @prev_cpu if it's part of a fully idle core.
*/
- if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ if (is_prev_allowed &&
+ cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -535,7 +594,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto out_unlock;
@@ -544,7 +603,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* core.
*/
if (flags & SCX_PICK_IDLE_CORE) {
- cpu = prev_cpu;
+ cpu = -EBUSY;
goto out_unlock;
}
}
@@ -552,7 +611,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
/*
* Use @prev_cpu if it's idle.
*/
- if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
}
@@ -583,12 +642,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
- if (cpu >= 0)
- goto out_unlock;
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
out_unlock:
rcu_read_unlock();
+out_enable:
+ preempt_enable();
return cpu;
}
@@ -598,7 +657,7 @@ out_unlock:
*/
void scx_idle_init_masks(void)
{
- int node;
+ int i;
/* Allocate global idle cpumasks */
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -609,13 +668,23 @@ void scx_idle_init_masks(void)
sizeof(*scx_idle_node_masks), GFP_KERNEL);
BUG_ON(!scx_idle_node_masks);
- for_each_node(node) {
- scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
- GFP_KERNEL, node);
- BUG_ON(!scx_idle_node_masks[node]);
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+ }
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
}
}
@@ -664,21 +733,12 @@ static void update_builtin_idle(int cpu, bool idle)
*/
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
lockdep_assert_rq_held(rq);
/*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
- /*
* Update the idle masks:
* - for real idle transitions (do_notify == true)
* - for idle-to-idle transitions (indicated by the previous task
@@ -695,6 +755,21 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
if (static_branch_likely(&scx_builtin_idle_enabled))
if (do_notify || is_idle_task(rq->curr))
update_builtin_idle(cpu, idle);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ *
+ * This must come after builtin idle update so that BPF schedulers can
+ * create interlocking between ops.update_idle() and ops.enqueue() -
+ * either enqueue() sees the idle bit or update_idle() sees the task
+ * that enqueue() queued.
+ */
+ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
}
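
To make the ordering comment above concrete, here is a hypothetical interlock a BPF scheduler could build on it (illustration only, not part of this patch):

/*
 * Illustration: in ops.enqueue(), after queueing a task on a shared DSQ,
 * a scheduler may do
 *
 *	if (scx_bpf_test_and_clear_cpu_idle(cpu))
 *		scx_bpf_kick_cpu(cpu, 0);
 *
 * while ops.update_idle(cpu, true) re-checks the shared DSQ. With the
 * builtin idle bit updated first, either enqueue() observes the idle bit
 * and kicks the CPU, or update_idle() observes the task that enqueue()
 * queued, so the wakeup cannot be lost between the two callbacks.
 */
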
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -718,23 +793,20 @@ static void reset_idle_masks(struct sched_ext_ops *ops)
cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
}
}
-#endif /* CONFIG_SMP */
void scx_idle_enable(struct sched_ext_ops *ops)
{
if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
- static_branch_enable(&scx_builtin_idle_enabled);
+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
else
- static_branch_disable(&scx_builtin_idle_enabled);
+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
- static_branch_enable(&scx_builtin_idle_per_node);
+ static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
else
- static_branch_disable(&scx_builtin_idle_per_node);
+ static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
reset_idle_masks(ops);
-#endif
}
void scx_idle_disable(void)
@@ -750,7 +822,7 @@ void scx_idle_disable(void)
static int validate_node(int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is disabled");
+ scx_kf_error("per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -760,13 +832,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_ops_error("invalid node %d", node);
+ scx_kf_error("invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_ops_error("unavailable node %d", node);
+ scx_kf_error("unavailable node %d", node);
return -EINVAL;
}
@@ -780,10 +852,69 @@ static bool check_builtin_idle_enabled(void)
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_ops_error("built-in idle tracking is disabled");
+ scx_kf_error("built-in idle tracking is disabled");
return false;
}
+static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+ s32 cpu;
+
+ if (!kf_cpu_valid(prev_cpu, NULL))
+ return -EINVAL;
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ /*
+ * If called from an unlocked context, acquire the task's rq lock,
+ * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+ *
+	 * Otherwise, only allow using this kfunc from ops.select_cpu()
+	 * and ops.enqueue().
+ */
+ if (scx_kf_allowed_if_unlocked()) {
+ rq = task_rq_lock(p, &rf);
+ } else {
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ return -EPERM;
+ rq = scx_locked_rq();
+ }
+
+ /*
+ * Validate locking correctness to access p->cpus_ptr and
+ * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
+ * otherwise, assert that p->pi_lock is held.
+ */
+ if (!rq)
+ lockdep_assert_held(&p->pi_lock);
+
+ /*
+ * This may also be called from ops.enqueue(), so we need to handle
+ * per-CPU tasks as well. For these tasks, we can skip all idle CPU
+ * selection optimizations and simply check whether the previously
+ * used CPU is idle and within the allowed cpumask.
+ */
+ if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
+ if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
+ scx_idle_test_and_clear_cpu(prev_cpu))
+ cpu = prev_cpu;
+ else
+ cpu = -EBUSY;
+ } else {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
+ allowed ?: p->cpus_ptr, flags);
+ }
+
+ if (scx_kf_allowed_if_unlocked())
+ task_rq_unlock(rq, p, &rf);
+
+ return cpu;
+}
+
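
A calling-context summary of select_cpu_from_kfunc() above (illustration only, restating the code):

/*
 *   unlocked (e.g. BPF test_run)       -> task_rq_lock(p) is taken here
 *   ops.select_cpu() / ops.enqueue()   -> reuse the already locked rq
 *   any other locked scx callback      -> rejected with -EPERM
 */
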
/**
* scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
* trigger an error if @cpu is invalid
@@ -791,14 +922,10 @@ static bool check_builtin_idle_enabled(void)
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
-#ifdef CONFIG_NUMA
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
-#else
- return 0;
-#endif
}
/**
@@ -808,9 +935,10 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
* @wake_flags: %SCX_WAKE_* flags
* @is_idle: out parameter indicating whether the returned CPU is idle
*
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
*
* Returns the picked CPU with *@is_idle indicating whether the picked CPU is
* currently idle and thus a good candidate for direct dispatching.
@@ -818,39 +946,52 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
-#ifdef CONFIG_SMP
s32 cpu;
-#endif
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
-
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
-#endif
-
-prev_cpu:
*is_idle = false;
+
return prev_cpu;
}
/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
+ * prioritizing those in @cpus_allowed
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
+ *
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the selected idle CPU, which will be automatically awakened upon
+ * returning from ops.select_cpu() and can be used for direct dispatch, or
+ * a negative value if no idle CPU is available.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+}
+
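
A hypothetical BPF-side usage sketch (not part of this patch), assuming the usual scx BPF scheduler boilerplate (BPF_STRUCT_OPS) and using the online cpumask purely as an example of @cpus_allowed; names are illustrative:

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	const struct cpumask *online = scx_bpf_get_online_cpumask();
	s32 cpu;

	/* prefer an idle CPU that is both allowed for @p and online */
	cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, online, 0);
	scx_bpf_put_cpumask(online);

	return cpu >= 0 ? cpu : prev_cpu;
}
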
+/**
* scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
* idle-tracking per-CPU cpumask of a target NUMA node.
* @node: target NUMA node
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
@@ -858,11 +999,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -875,18 +1012,14 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -897,7 +1030,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
@@ -905,14 +1038,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(node)->smt;
else
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -926,21 +1055,17 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(NUMA_NO_NODE)->smt;
else
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -973,10 +1098,10 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (ops_cpu_valid(cpu, NULL))
- return scx_idle_test_and_clear_cpu(cpu);
- else
+ if (!kf_cpu_valid(cpu, NULL))
return false;
+
+ return scx_idle_test_and_clear_cpu(cpu);
}
/**
@@ -1034,7 +1159,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1111,7 +1236,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
s32 cpu;
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1142,6 +1267,8 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
@@ -1149,21 +1276,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.set = &scx_kfunc_ids_idle,
};
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
int scx_idle_init(void)
{
int ret;
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 511cc2221f7a..fa583f141f35 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,22 +12,11 @@
struct sched_ext_ops;
-#ifdef CONFIG_SMP
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
void scx_idle_init_masks(void);
-bool scx_idle_test_and_clear_cpu(int cpu);
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
-#else /* !CONFIG_SMP */
-static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
-static inline void scx_idle_init_masks(void) {}
-static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
-static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
-{
- return -EBUSY;
-}
-#endif /* CONFIG_SMP */
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags);
void scx_idle_enable(struct sched_ext_ops *ops);
void scx_idle_disable(void);
int scx_idle_init(void);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e43993a4e580..b173a059315c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str)
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
-#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
@@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu)
* (default: ~5%)
*/
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
-#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void)
return 0;
}
late_initcall(sched_fair_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se)
return task_has_idle_policy(task_of(se));
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -884,23 +882,44 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
+ * Set the vruntime up to which an entity can run before looking
+ * for another entity to pick.
+ * In case of run to parity, we use the shortest slice of the enqueued
+ * entities to set the protected period.
+ * When run to parity is disabled, we give a minimum quantum to the running
+ * entity to ensure progress.
*/
-static inline void set_protect_slice(struct sched_entity *se)
+static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- se->vlag = se->deadline;
+ u64 slice = normalized_sysctl_sched_base_slice;
+ u64 vprot = se->deadline;
+
+ if (sched_feat(RUN_TO_PARITY))
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ slice = min(slice, se->slice);
+ if (slice != se->slice)
+ vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
+
+ se->vprot = vprot;
+}
+
+static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = cfs_rq_min_slice(cfs_rq);
+
+ se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
}
static inline bool protect_slice(struct sched_entity *se)
{
- return se->vlag == se->deadline;
+ return ((s64)(se->vprot - se->vruntime) > 0);
}
static inline void cancel_protect_slice(struct sched_entity *se)
{
if (protect_slice(se))
- se->vlag = se->deadline + 1;
+ se->vprot = se->vruntime;
}
/*
@@ -922,7 +941,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -939,7 +958,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
+ if (curr && protect && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -983,6 +1002,11 @@ found:
return best;
}
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ return __pick_eevdf(cfs_rq, true);
+}
+
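
A worked example of the new protection window (illustration only, assuming a nice-0 entity so calc_delta_fair() maps the slice 1:1 into vruntime):

/*
 *   se->vruntime  = 10,000,000 ns
 *   se->deadline  = 14,000,000 ns
 *   se->slice     =  4,000,000 ns, cfs_rq_min_slice() = 3,000,000 ns
 *
 * With RUN_TO_PARITY the effective slice is min(3 ms, 4 ms) = 3 ms, so
 * se->vprot = min(14 ms, 10 ms + 3 ms) = 13,000,000 ns. protect_slice()
 * returns true, and pick_eevdf() keeps returning @se, until se->vruntime
 * crosses se->vprot; cancel_protect_slice() ends the window early by
 * pulling vprot back to the current vruntime.
 */
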
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -996,7 +1020,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
/**************************************************************
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
@@ -1008,7 +1031,6 @@ int sched_update_scaling(void)
return 0;
}
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -1041,7 +1063,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#include "pelt.h"
-#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -1131,34 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
-#else /* !CONFIG_SMP */
-void init_entity_runnable_average(struct sched_entity *se)
-{
-}
-void post_init_entity_util_avg(struct task_struct *p)
-{
-}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* CONFIG_SMP */
-
-static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
s64 delta_exec;
- delta_exec = now - curr->exec_start;
+ delta_exec = now - se->exec_start;
if (unlikely(delta_exec <= 0))
return delta_exec;
- curr->exec_start = now;
- curr->sum_exec_runtime += delta_exec;
+ se->exec_start = now;
+ if (entity_is_task(se)) {
+ struct task_struct *donor = task_of(se);
+ struct task_struct *running = rq->curr;
+ /*
+ * If se is a task, we account the time against the running
+ * task, as w/ proxy-exec they may not be the same.
+ */
+ running->se.exec_start = now;
+ running->se.sum_exec_runtime += delta_exec;
+
+ trace_sched_stat_runtime(running, delta_exec);
+ account_group_exec_runtime(running, delta_exec);
+
+ /* cgroup time is always accounted against the donor */
+ cgroup_account_cputime(donor, delta_exec);
+ } else {
+ /* If not task, account the time against donor se */
+ se->sum_exec_runtime += delta_exec;
+ }
if (schedstat_enabled()) {
struct sched_statistics *stats;
- stats = __schedstats_from_se(curr);
+ stats = __schedstats_from_se(se);
__schedstat_set(stats->exec_max,
max(delta_exec, stats->exec_max));
}
@@ -1166,58 +1193,12 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
return delta_exec;
}
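
To summarize the accounting split that update_se() above implements under proxy execution (illustration, not part of this patch):

/*
 *   @se is a task entity: the donor was picked to run, but rq->curr may
 *   be executing on its behalf, so rq->curr is charged exec_start,
 *   sum_exec_runtime, the runtime tracepoint and group (posix timer)
 *   runtime, while the cgroup charge goes to the donor via
 *   cgroup_account_cputime().
 *
 *   @se is a group entity: it accumulates sum_exec_runtime directly, as
 *   before.
 */
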
-static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
-{
- trace_sched_stat_runtime(p, delta_exec);
- account_group_exec_runtime(p, delta_exec);
- cgroup_account_cputime(p, delta_exec);
-}
-
-static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (curr->vlag == curr->deadline)
- return false;
-
- return !entity_eligible(cfs_rq, curr);
-}
-
-static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
- struct sched_entity *pse, struct sched_entity *se)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (pse->slice >= se->slice)
- return false;
-
- if (!entity_eligible(cfs_rq, pse))
- return false;
-
- if (entity_before(pse, se))
- return true;
-
- if (!entity_eligible(cfs_rq, se))
- return true;
-
- return false;
-}
-
/*
* Used by other classes to account runtime.
*/
s64 update_curr_common(struct rq *rq)
{
- struct task_struct *donor = rq->donor;
- s64 delta_exec;
-
- delta_exec = update_curr_se(rq, &donor->se);
- if (likely(delta_exec > 0))
- update_curr_task(donor, delta_exec);
-
- return delta_exec;
+ return update_se(rq, &rq->donor->se);
}
/*
@@ -1225,6 +1206,12 @@ s64 update_curr_common(struct rq *rq)
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
+ /*
+	 * Note: cfs_rq->curr corresponds to the task picked to
+	 * run (i.e. rq->donor.se) which, due to proxy-exec, may
+	 * not necessarily be the task actually running
+	 * (rq->curr.se). This is easy to confuse!
+ */
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
@@ -1233,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq, curr);
+ delta_exec = update_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
@@ -1242,10 +1229,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
- struct task_struct *p = task_of(curr);
-
- update_curr_task(p, delta_exec);
-
/*
* If the fair_server is active, we need to account for the
* fair_server time whether or not the task is running on
@@ -1265,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (cfs_rq->nr_queued == 1)
return;
- if (resched || did_preempt_short(cfs_rq, curr)) {
+ if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
@@ -2114,12 +2097,12 @@ static inline int numa_idle_core(int idle_core, int cpu)
return idle_core;
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline int numa_idle_core(int idle_core, int cpu)
{
return idle_core;
}
-#endif
+#endif /* !CONFIG_SCHED_SMT */
/*
* Gather all necessary information to make NUMA balancing placement
@@ -2273,7 +2256,8 @@ static bool task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
- if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+ if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
+ !cur->mm))
cur = NULL;
/*
@@ -3329,6 +3313,15 @@ static void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ /*
+	 * If memory is pinned to a single NUMA node via cpuset.mems, no
+	 * pages can be migrated, so skip the scan entirely.
+ */
+ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
+ trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
+ return;
+ }
+
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
@@ -3663,7 +3656,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
p->numa_scan_period = task_scan_start(p);
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
+
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
@@ -3680,20 +3674,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
}
-#endif
cfs_rq->nr_queued++;
}
@@ -3701,12 +3693,10 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
-#endif
cfs_rq->nr_queued--;
}
@@ -3758,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
-#ifdef CONFIG_SMP
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3775,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
-#else
-static inline void
-enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-#endif
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -3795,6 +3778,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3811,20 +3795,19 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_load_set(&se->load, weight);
-#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
-#endif
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- update_load_add(&cfs_rq->load, se->load.weight);
place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
/*
* The entity's vruntime has been adjusted, so let's check
@@ -3851,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
@@ -3958,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-#endif /* CONFIG_SMP */
/*
* Recomputes the group entity based on the current state of its group
@@ -3979,20 +3960,16 @@ static void update_cfs_group(struct sched_entity *se)
if (throttled_hierarchy(gcfs_rq))
return;
-#ifndef CONFIG_SMP
- shares = READ_ONCE(gcfs_rq->tg->shares);
-#else
shares = calc_group_shares(gcfs_rq);
-#endif
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_cfs_group(struct sched_entity *se)
{
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
@@ -4017,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
}
-#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
if (sa->load_sum)
@@ -4469,7 +4445,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
return true;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
@@ -4482,7 +4458,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_NO_HZ_COMMON
static inline void migrate_se_pelt_lag(struct sched_entity *se)
@@ -4563,9 +4539,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
__update_load_avg_blocked_se(now, se);
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static void migrate_se_pelt_lag(struct sched_entity *se) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
@@ -4933,13 +4909,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
goto done;
/*
- * To avoid overestimation of actual task utilization, skip updates if
- * we cannot grant there is idle time in this CPU.
- */
- if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
- return;
-
- /*
* To avoid underestimate of task utilization, skip updates of EWMA if
* we cannot grant that thread got all CPU time it wanted.
*/
@@ -5139,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
-#else /* CONFIG_SMP */
-
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- return !cfs_rq->nr_queued;
-}
-
-#define UPDATE_TG 0x0
-#define SKIP_AGE_LOAD 0x0
-#define DO_ATTACH 0x0
-#define DO_DETACH 0x0
-
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
-{
- cfs_rq_util_change(cfs_rq, 0);
-}
-
-static inline void remove_entity_load_avg(struct sched_entity *se) {}
-
-static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-
-static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
-{
- return 0;
-}
-
-static inline void
-util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
-static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
-#endif /* CONFIG_SMP */
-
void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_entity *se = &p->se;
@@ -5248,7 +5175,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
* = (W*V + w_i*(V - vl_i)) / (W + w_i)
* = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
- * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
* = V - w_i*vl_i / (W + w_i)
*
* And the actual lag after adding an entity with vl_i is:
@@ -5545,7 +5472,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(se);
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5680,7 +5607,7 @@ void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* CONFIG_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -5688,16 +5615,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* CONFIG_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
- return 100000000ULL;
-}
+#endif /* !CONFIG_JUMP_LABEL */
static inline u64 sched_cfs_bandwidth_slice(void)
{
@@ -5884,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5968,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
done:
/*
* Note: distribution will already see us throttled via the
@@ -6083,7 +5996,6 @@ unthrottle_throttle:
resched_curr(rq);
}
-#ifdef CONFIG_SMP
static void __cfsb_csd_unthrottle(void *arg)
{
struct cfs_rq *cursor, *tmp;
@@ -6142,12 +6054,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
if (first)
smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
}
-#else
-static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
-{
- unthrottle_cfs_rq(cfs_rq);
-}
-#endif
static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
@@ -6485,8 +6391,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-extern const u64 max_cfs_quota_period;
-
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -6513,7 +6417,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
* to fail.
*/
new = old * 2;
- if (new < max_cfs_quota_period) {
+ if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
cfs_b->burst *= 2;
@@ -6547,7 +6451,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
- cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->period = us_to_ktime(default_bw_period_us());
cfs_b->burst = 0;
cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
@@ -6603,7 +6507,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* guaranteed at this point that no additional cfs_rq of this group can
* join a CSD list.
*/
-#ifdef CONFIG_SMP
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
unsigned long flags;
@@ -6615,7 +6518,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
__cfsb_csd_unthrottle(rq);
local_irq_restore(flags);
}
-#endif
}
/*
@@ -6728,9 +6630,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
if (cfs_task_bw_constrained(p))
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#endif
+#endif /* CONFIG_NO_HZ_FULL */
-#else /* CONFIG_CFS_BANDWIDTH */
+#else /* !CONFIG_CFS_BANDWIDTH: */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
@@ -6772,7 +6674,7 @@ bool cfs_task_bw_constrained(struct task_struct *p)
return false;
}
#endif
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* !CONFIG_CFS_BANDWIDTH */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
@@ -6817,7 +6719,7 @@ static void hrtick_update(struct rq *rq)
hrtick_start_fair(rq, donor);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
@@ -6826,9 +6728,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static inline void hrtick_update(struct rq *rq)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
unsigned long rq_util_min, rq_util_max;
@@ -6870,9 +6771,6 @@ static inline void check_update_overutilized_status(struct rq *rq)
if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
set_rd_overutilized(rq->rd, 1);
}
-#else
-static inline void check_update_overutilized_status(struct rq *rq) { }
-#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
@@ -6881,12 +6779,10 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
return sched_idle_rq(cpu_rq(cpu));
}
-#endif
static void
requeue_delayed_entity(struct sched_entity *se)
@@ -6941,7 +6837,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
util_est_enqueue(&rq->cfs, p);
if (flags & ENQUEUE_DELAYED) {
@@ -7065,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
@@ -7081,9 +6976,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
h_nr_idle = task_has_idle_policy(p);
if (task_sleep || task_delayed || !se->sched_delayed)
h_nr_runnable = 1;
- } else {
- cfs_rq = group_cfs_rq(se);
- slice = cfs_rq_min_slice(cfs_rq);
}
for_each_sched_entity(se) {
@@ -7093,6 +6985,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (p && &p->se == se)
return -1;
+ slice = cfs_rq_min_slice(cfs_rq);
break;
}
@@ -7151,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
sub_nr_running(rq, h_nr_queued);
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
@@ -7183,7 +7073,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ if (!p->se.sched_delayed)
util_est_dequeue(&rq->cfs, p);
util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
@@ -7198,7 +7088,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
return true;
}
-#ifdef CONFIG_SMP
+static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
+{
+ return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
+}
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
@@ -7359,8 +7252,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
- if (sync && cpu_rq(this_cpu)->nr_running == 1)
- return this_cpu;
+ if (sync) {
+ struct rq *rq = cpu_rq(this_cpu);
+
+ if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
+ return this_cpu;
+ }
if (available_idle_cpu(prev_cpu))
return prev_cpu;
@@ -7665,7 +7562,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
}
-#else /* CONFIG_SCHED_SMT */
+#else /* !CONFIG_SCHED_SMT: */
static inline void set_idle_cores(int cpu, int val)
{
@@ -7686,7 +7583,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
return -1;
}
-#endif /* CONFIG_SCHED_SMT */
+#endif /* !CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
@@ -8731,9 +8628,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return sched_balance_newidle(rq, rf) != 0;
}
-#else
-static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
-#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
{
@@ -8755,6 +8649,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ bool do_preempt_short = false;
if (unlikely(se == pse))
return;
@@ -8803,7 +8698,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* When a non-idle entity preempts an idle entity,
* don't give idle entity slice protection.
*/
- cancel_protect_slice(se);
+ do_preempt_short = true;
goto preempt;
}
@@ -8821,22 +8716,24 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
/*
* If @p has a shorter slice than current and @p is eligible, override
* current's slice protection in order to allow preemption.
- *
- * Note that even if @p does not turn out to be the most eligible
- * task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se))
- cancel_protect_slice(se);
+ do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
/*
* If @p has become the most eligible task, force preemption.
*/
- if (pick_eevdf(cfs_rq) == pse)
+ if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
goto preempt;
+ if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
+ update_protect_slice(cfs_rq, se);
+
return;
preempt:
+ if (do_preempt_short)
+ cancel_protect_slice(se);
+
resched_curr_lazy(rq);
}
@@ -8927,7 +8824,7 @@ again:
return p;
simple:
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
put_prev_set_next_task(rq, prev, p);
return p;
@@ -9043,7 +8940,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
return true;
}
-#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
@@ -9345,13 +9241,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
return src_weight - dst_weight;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
return 0;
}
-#endif
+#endif /* !CONFIG_NUMA_BALANCING */
/*
* Check whether the task is ineligible on the destination cpu
@@ -9395,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) throttled_lb_pair, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
- * 5) are cache-hot on their current CPU.
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
@@ -9417,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;
+ if (task_is_blocked(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -9452,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_on_cpu(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p) ||
+ task_current_donor(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -9496,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
schedstat_inc(p->stats.nr_forced_migrations);
}
+ WARN_ON(task_current(env->src_rq, p));
+ WARN_ON(task_current_donor(env->src_rq, p));
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -9760,12 +9664,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
if (!has_blocked)
rq->has_blocked_load = 0;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
{
@@ -9874,7 +9778,7 @@ static unsigned long task_h_load(struct task_struct *p)
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -9891,7 +9795,7 @@ static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg;
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void sched_balance_update_blocked_averages(int cpu)
{
@@ -10036,9 +9940,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
min_capacity = ULONG_MAX;
max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
@@ -10051,7 +9955,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
*/
@@ -10258,7 +10162,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group
(sgs->group_weight - sgs->idle_cpus != 1))
return false;
- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
+ return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
}
/* One group has more than one SMT CPU while the other group does not */
@@ -10495,7 +10399,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
+ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
+ READ_ONCE(sg->asym_prefer_cpu));
case group_misfit_task:
/*
@@ -10603,7 +10508,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
return remote;
return all;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
return all;
@@ -10613,7 +10518,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
return regular;
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
struct sg_lb_stats;
@@ -12161,8 +12066,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
+ *
+ * sched_balance_newidle() bumps the cost whenever newidle
+ * balance fails, and we don't want things to grow out of
+ * control. Use the sysctl_sched_migration_cost as the upper
+ * limit, plus a little extra to avoid off-by-ones.
*/
- sd->max_newidle_lb_cost = cost;
+ sd->max_newidle_lb_cost =
+ min(cost, sysctl_sched_migration_cost + 200);
sd->last_decay_max_lb_cost = jiffies;
} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
/*
@@ -12759,7 +12670,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balancer_kick(struct rq *rq) { }
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
@@ -12768,7 +12679,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
}
static inline void nohz_newidle_balance(struct rq *this_rq) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* sched_balance_newidle is called by schedule() if this_cpu is about to become
@@ -12854,10 +12765,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Failing newidle means it is not effective;
+ * bump the cost so we end up doing less of it.
+ */
+ if (!pulled_task)
+ domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
+
+ update_newidle_cost(sd, domain_cost);
}
/*
@@ -12965,8 +12883,6 @@ static void rq_offline_fair(struct rq *rq)
clear_tg_offline_cfs_rqs(rq);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_SCHED_CORE
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
@@ -13063,10 +12979,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
cfs_rqa = sea->cfs_rq;
cfs_rqb = seb->cfs_rq;
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
cfs_rqa = &task_rq(a)->cfs;
cfs_rqb = &task_rq(b)->cfs;
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
@@ -13090,9 +13006,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
#endif
return throttled_hierarchy(cfs_rq);
}
-#else
+#else /* !CONFIG_SCHED_CORE: */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
-#endif
+#endif /* !CONFIG_SCHED_CORE */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -13186,15 +13102,14 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
list_add_leaf_cfs_rq(cfs_rq);
}
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_SMP
/*
* In case the task sched_avg hasn't been attached:
* - A forked task which hasn't been woken up by wake_up_new_task().
@@ -13203,7 +13118,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
*/
if (!se->avg.last_update_time)
return;
-#endif
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
@@ -13267,7 +13181,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
{
struct sched_entity *se = &p->se;
-#ifdef CONFIG_SMP
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
@@ -13275,7 +13188,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
*/
list_move(&se->group_node, &rq->cfs_tasks);
}
-#endif
if (!first)
return;
@@ -13313,9 +13225,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13330,10 +13240,8 @@ static void task_change_group_fair(struct task_struct *p)
detach_task_cfs_rq(p);
-#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
-#endif
set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
@@ -13629,7 +13537,6 @@ DEFINE_SCHED_CLASS(fair) = {
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
-#ifdef CONFIG_SMP
.balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13639,7 +13546,6 @@ DEFINE_SCHED_CLASS(fair) = {
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_fair,
-#endif
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -13702,7 +13608,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
-#ifdef CONFIG_SMP
int i;
for_each_possible_cpu(i) {
@@ -13724,6 +13629,4 @@ __init void init_sched_fair_class(void)
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
-#endif /* SMP */
-
}
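To see what the two newidle-balance hunks in fair.c above do together, the standalone sketch below (plain C, not kernel code) models only the bump-and-clamp path: a failed newidle balance inflates the domain cost by 3/2, and update_newidle_cost() refuses to let the estimate grow past sysctl_sched_migration_cost plus a little slack. The 0.5ms migration cost is the usual default; the periodic decay path of the real function is not modeled.

#include <stdio.h>

typedef unsigned long long u64;

static const u64 sysctl_sched_migration_cost = 500000ULL;   /* 0.5ms, default */
static u64 max_newidle_lb_cost;

/* Only the "cost went up" branch of update_newidle_cost(). */
static void update_newidle_cost(u64 cost)
{
        if (cost > max_newidle_lb_cost) {
                u64 limit = sysctl_sched_migration_cost + 200;

                max_newidle_lb_cost = cost < limit ? cost : limit;
        }
}

int main(void)
{
        int i;

        /* A successful pass records the measured cost as-is. */
        update_newidle_cost(100000);    /* pretend one pass took 0.1ms */

        /* Repeated failures inflate the estimate until the cap bites. */
        for (i = 0; i < 6; i++) {
                u64 domain_cost = (3 * max_newidle_lb_cost) / 2;

                update_newidle_cost(domain_cost);
                printf("failed balance %d: max_newidle_lb_cost=%llu ns\n",
                       i, max_newidle_lb_cost);
        }
        return 0;
}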
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..c39b089d4f09 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,6 +6,11 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
+#include <linux/cpuidle.h>
+#include <linux/suspend.h>
+#include <linux/livepatch.h>
+#include "sched.h"
+#include "smp.h"
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
-#endif
+#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */
static noinline int __cpuidle cpu_idle_poll(void)
{
@@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void)
if (static_branch_unlikely(&arch_needs_tick_broadcast))
tick_broadcast_exit();
}
-#else
+#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */
static inline void cond_tick_broadcast_enter(void) { }
static inline void cond_tick_broadcast_exit(void) { }
-#endif
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
/**
* default_idle_call - Default CPU idle routine.
@@ -427,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state)
* idle-task scheduling class.
*/
-#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
@@ -439,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return WARN_ON_ONCE(1);
}
-#endif
/*
* Idle tasks are unconditionally rescheduled:
@@ -526,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = {
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
-#ifdef CONFIG_SMP
.balance = balance_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 81bc8b329ef1..a4cf17b1fab0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,6 +7,8 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
+#include <linux/sched/isolation.h>
+#include "sched.h"
enum hk_flags {
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
@@ -40,7 +42,7 @@ int housekeeping_any_cpu(enum hk_type type)
if (cpu < nr_cpu_ids)
return cpu;
- cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
+ cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask);
if (likely(cpu < nr_cpu_ids))
return cpu;
/*
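The one-line isolation.c change above swaps the plain "pick any matching CPU" helper for its _distribute variant; the kernel's distribute helpers keep a rotating hint so successive callers are handed different CPUs instead of all landing on the lowest-numbered housekeeping CPU. The toy below (plain C, illustrative only, not the real cpumask API) models that difference on an 8-bit mask.

#include <stdio.h>

#define NBITS 8

static int next_hint;   /* stands in for the kernel's rotating hint */

/* "any_and": effectively the first set bit of (a & b), every time. */
static int any_and(unsigned int a, unsigned int b)
{
        unsigned int both = a & b;

        for (int i = 0; i < NBITS; i++)
                if (both & (1u << i))
                        return i;
        return -1;
}

/* "_distribute": start the search after the previous answer. */
static int any_and_distribute(unsigned int a, unsigned int b)
{
        unsigned int both = a & b;

        for (int i = 0; i < NBITS; i++) {
                int bit = (next_hint + i) % NBITS;

                if (both & (1u << bit)) {
                        next_hint = bit + 1;
                        return bit;
                }
        }
        return -1;
}

int main(void)
{
        unsigned int housekeeping = 0x0F, online = 0xFF;

        for (int i = 0; i < 4; i++)
                printf("any_and=%d distribute=%d\n",
                       any_and(housekeeping, online),
                       any_and_distribute(housekeeping, online));
        return 0;
}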
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index c48900b856a2..b601e0243d0e 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,6 +6,8 @@
* figure. It's a silly number but people think it's important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
+#include <linux/sched/nohz.h>
+#include "sched.h"
/*
* Global load-average calculations
@@ -80,7 +82,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (int)this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
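The cast change above goes together with the sched.h hunk further down that widens rq->nr_uninterruptible from unsigned int to unsigned long. An individual runqueue's counter can legitimately sit "below zero" (a task may block on one CPU and be woken, and accounted, on another), and only the sum across CPUs is meaningful; casting the unsigned value to a signed type of the same width keeps that negative contribution intact instead of truncating it. A minimal sketch, assuming the usual two's-complement targets:

#include <stdio.h>

int main(void)
{
        unsigned long cpu0 = 7;                  /* 7 sleepers charged here */
        unsigned long cpu1 = (unsigned long)-3;  /* 3 of them were woken over here */
        long nr_active = 0;

        nr_active += (long)cpu0;    /* +7 */
        nr_active += (long)cpu1;    /* -3: same-width cast keeps the sign */

        printf("uninterruptible total = %ld\n", nr_active);     /* prints 4 */
        return 0;
}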
@@ -333,12 +335,12 @@ static void calc_global_nohz(void)
smp_wmb();
calc_load_idx++;
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 809194cd779f..62fba83b7bb1 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,6 +4,8 @@
*
* membarrier system call
*/
+#include <uapi/linux/membarrier.h>
+#include "sched.h"
/*
* For documentation purposes, here are some membarrier ordering
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 7a8534a2deff..fa83bbaf4f3e 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -23,6 +23,7 @@
* Move PELT related code from fair.c into this pelt.c file
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
+#include "pelt.h"
/*
* Approximate:
@@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
return 0;
}
-#endif
+#endif /* CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
@@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
return ret;
}
-#endif
+#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */
/*
* Load avg and utilization metrics need to be updated periodically and before
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index f4f6a0875c66..62c3fa543c0f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,4 +1,8 @@
-#ifdef CONFIG_SMP
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _KERNEL_SCHED_PELT_H
+#define _KERNEL_SCHED_PELT_H
+#include "sched.h"
+
#include "sched-pelt.h"
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
@@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq)
{
return READ_ONCE(rq->avg_hw.load_avg);
}
-#else
+#else /* !CONFIG_SCHED_HW_PRESSURE: */
static inline int
update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
@@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
-#endif
+#endif /* !CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
@@ -174,63 +178,12 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
-#else
+#else /* !CONFIG_CFS_BANDWIDTH: */
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
}
-#endif
-
-#else
-
-static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
-{
- return 0;
-}
-
-static inline int
-update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
-{
- return 0;
-}
-
-static inline int
-update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
-{
- return 0;
-}
-
-static inline int
-update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
-{
- return 0;
-}
-
-static inline u64 hw_load_avg(struct rq *rq)
-{
- return 0;
-}
-
-static inline int
-update_irq_load_avg(struct rq *rq, u64 running)
-{
- return 0;
-}
-
-static inline u64 rq_clock_pelt(struct rq *rq)
-{
- return rq_clock_task(rq);
-}
-
-static inline void
-update_rq_clock_pelt(struct rq *rq, s64 delta) { }
-
-static inline void
-update_idle_rq_clock_pelt(struct rq *rq) { }
-
-static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
-#endif
-
+#endif /* !CONFIG_CFS_BANDWIDTH */
+#endif /* _KERNEL_SCHED_PELT_H */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index bb56805e3d47..59fdb7ebbf22 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -136,6 +136,10 @@
* cost-wise, yet way more sensitive and accurate than periodic
* sampling of the aggregate task states would be.
*/
+#include <linux/sched/clock.h>
+#include <linux/workqueue.h>
+#include <linux/psi.h>
+#include "sched.h"
static int psi_bug __read_mostly;
@@ -172,17 +176,35 @@ struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
+static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+ write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+ write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+ return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+ return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
static void psi_avgs_work(struct work_struct *work);
static void poll_timer_fn(struct timer_list *t);
static void group_init(struct psi_group *group)
{
- int cpu;
-
group->enabled = true;
- for_each_possible_cpu(cpu)
- seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
mutex_init(&group->avgs_lock);
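The helpers above replace the per-group, per-CPU seqcount with a single psi_seq per CPU, but the protocol readers and writers follow is unchanged. The standalone C11 sketch below shows that protocol in isolation (single-threaded here, memory orderings simplified): a writer bumps the counter to an odd value before touching the data and back to even afterwards, and a reader keeps re-copying until it has seen an even, unchanged counter -- which is what get_recent_times() keeps doing, now against the shared per-CPU counter.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int psi_seq;    /* one per CPU in the kernel */
static unsigned long long times[3];     /* stand-in for groupc->times[] */

static unsigned int read_begin(void)
{
        unsigned int seq;

        /* Wait out a writer: an odd count means an update is in flight. */
        while ((seq = atomic_load_explicit(&psi_seq, memory_order_acquire)) & 1)
                ;
        return seq;
}

static int read_retry(unsigned int seq)
{
        atomic_thread_fence(memory_order_acquire);
        return atomic_load_explicit(&psi_seq, memory_order_relaxed) != seq;
}

static void write_begin(void)
{
        atomic_fetch_add_explicit(&psi_seq, 1, memory_order_release);  /* -> odd */
}

static void write_end(void)
{
        atomic_fetch_add_explicit(&psi_seq, 1, memory_order_release);  /* -> even */
}

int main(void)
{
        unsigned long long snap[3];
        unsigned int seq;

        /* Writer: one critical section now covers every group on the CPU. */
        write_begin();
        times[0] += 10; times[1] += 20; times[2] += 30;
        write_end();

        /* Reader: snapshot, then retry until the copy is known-coherent. */
        do {
                seq = read_begin();
                snap[0] = times[0]; snap[1] = times[1]; snap[2] = times[2];
        } while (read_retry(seq));

        printf("%llu %llu %llu\n", snap[0], snap[1], snap[2]);
        return 0;
}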
@@ -262,14 +284,14 @@ static void get_recent_times(struct psi_group *group, int cpu,
/* Snapshot a coherent view of the CPU state */
do {
- seq = read_seqcount_begin(&groupc->seq);
+ seq = psi_read_begin(cpu);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
if (cpu == current_cpu)
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
- } while (read_seqcount_retry(&groupc->seq, seq));
+ } while (psi_read_retry(cpu, seq));
/* Calculate state time deltas against the previous snapshot */
for (s = 0; s < NR_PSI_STATES; s++) {
@@ -733,7 +755,7 @@ static int psi_rtpoll_worker(void *data)
static void poll_timer_fn(struct timer_list *t)
{
- struct psi_group *group = from_timer(group, t, rtpoll_timer);
+ struct psi_group *group = timer_container_of(group, t, rtpoll_timer);
atomic_set(&group->rtpoll_wakeup, 1);
wake_up_interruptible(&group->rtpoll_wait);
@@ -768,31 +790,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
groupc->times[PSI_NONIDLE] += delta;
}
+#define for_each_group(iter, group) \
+ for (typeof(group) iter = group; iter; iter = iter->parent)
+
static void psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set,
- bool wake_clock)
+ u64 now, bool wake_clock)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
u32 state_mask;
- u64 now;
lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we update the task counts according to the state
- * change requested through the @clear and @set bits.
- *
- * Then if the cgroup PSI stats accounting enabled, we
- * assess the aggregate resource states this CPU's tasks
- * have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- */
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
- /*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
@@ -843,7 +855,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
return;
}
@@ -864,8 +875,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
-
if (state_mask & group->rtpoll_states)
psi_schedule_rtpoll_work(group, 1, false);
@@ -900,24 +909,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
- struct psi_group *group;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- group = task_psi_group(task);
- do {
- psi_group_change(group, cpu, clear, set, true);
- } while ((group = group->parent));
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(task))
+ psi_group_change(group, cpu, clear, set, now, true);
+ psi_write_end(cpu);
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep)
{
- struct psi_group *group, *common = NULL;
+ struct psi_group *common = NULL;
int cpu = task_cpu(prev);
+ u64 now;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -926,16 +940,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
- group = task_psi_group(next);
- do {
- if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
- PSI_ONCPU) {
+ for_each_group(group, task_psi_group(next)) {
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ if (groupc->state_mask & PSI_ONCPU) {
common = group;
break;
}
-
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ }
}
if (prev->pid) {
@@ -968,12 +981,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
psi_flags_change(prev, clear, set);
- group = task_psi_group(prev);
- do {
+ for_each_group(group, task_psi_group(prev)) {
if (group == common)
break;
- psi_group_change(group, cpu, clear, set, wake_clock);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
/*
* TSK_ONCPU is handled up to the common ancestor. If there are
@@ -983,20 +995,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
*/
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = group->parent)
- psi_group_change(group, cpu, clear, set, wake_clock);
+ for_each_group(group, common)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
+ psi_write_end(cpu);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
int cpu = task_cpu(curr);
- struct psi_group *group;
struct psi_group_cpu *groupc;
s64 delta;
u64 irq;
+ u64 now;
if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
@@ -1005,8 +1018,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
lockdep_assert_rq_held(rq);
- group = task_psi_group(curr);
- if (prev && task_psi_group(prev) == group)
+ if (prev && task_psi_group(prev) == task_psi_group(curr))
return;
irq = irq_time_read(cpu);
@@ -1015,27 +1027,24 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
rq->psi_irq_time = irq;
- do {
- u64 now;
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(curr)) {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
- write_seqcount_end(&groupc->seq);
-
if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
psi_schedule_rtpoll_work(group, 1, false);
- } while ((group = group->parent));
+ }
+ psi_write_end(cpu);
}
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/**
* psi_memstall_enter - mark the beginning of a memory stall section
@@ -1221,12 +1230,14 @@ void psi_cgroup_restart(struct psi_group *group)
return;
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
+ u64 now;
- rq_lock_irq(rq, &rf);
- psi_group_change(group, cpu, 0, 0, true);
- rq_unlock_irq(rq, &rf);
+ guard(rq_lock_irq)(cpu_rq(cpu));
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ psi_write_end(cpu);
}
}
#endif /* CONFIG_CGROUPS */
@@ -1440,7 +1451,7 @@ void psi_trigger_destroy(struct psi_trigger *t)
group->rtpoll_task,
lockdep_is_held(&group->rtpoll_trigger_lock));
rcu_assign_pointer(group->rtpoll_task, NULL);
- del_timer(&group->rtpoll_timer);
+ timer_delete(&group->rtpoll_timer);
}
}
mutex_unlock(&group->rtpoll_trigger_lock);
@@ -1651,7 +1662,7 @@ static const struct proc_ops psi_irq_proc_ops = {
.proc_poll = psi_fop_poll,
.proc_release = psi_fop_release,
};
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
static int __init psi_proc_init(void)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index fa03ec3ed56a..7936d4333731 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -4,6 +4,9 @@
* policies)
*/
+#include "sched.h"
+#include "pelt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
@@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void)
return 0;
}
late_initcall(sched_rt_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
void init_rt_rq(struct rt_rq *rt_rq)
{
@@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-#endif /* CONFIG_SMP */
/* We start in dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -89,6 +90,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_throttled = 0;
rt_rq->rt_runtime = 0;
raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+ rt_rq->tg = &root_task_group;
#endif
}
@@ -175,11 +177,14 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
+ /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
return rt_rq->rq;
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
+ WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group);
return rt_se->rt_rq;
}
@@ -187,11 +192,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_se->rt_rq;
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
return rt_rq->rq;
}
void unregister_rt_sched_group(struct task_group *tg)
{
+ if (!rt_group_sched_enabled())
+ return;
+
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
}
@@ -200,6 +209,9 @@ void free_rt_sched_group(struct task_group *tg)
{
int i;
+ if (!rt_group_sched_enabled())
+ return;
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -244,6 +256,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
struct sched_rt_entity *rt_se;
int i;
+ if (!rt_group_sched_enabled())
+ return 1;
+
tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
if (!tg->rt_rq)
goto err;
@@ -277,7 +292,7 @@ err:
return 0;
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#define rt_entity_is_task(rt_se) (1)
@@ -313,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_SMP
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
@@ -416,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
}
}
-#else
-
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void rt_queue_push_tasks(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
@@ -471,20 +469,17 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
return cpu_cap >= min(min_cap, max_cap);
}
-#else
+#else /* !CONFIG_UCLAMP_TASK: */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
return true;
}
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_RT_GROUP_SCHED
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- if (!rt_rq->tg)
- return RUNTIME_INF;
-
return rt_rq->rt_runtime;
}
@@ -497,6 +492,11 @@ typedef struct task_group *rt_rq_iter_t;
static inline struct task_group *next_task_group(struct task_group *tg)
{
+ if (!rt_group_sched_enabled()) {
+ WARN_ON(tg != &root_task_group);
+ return NULL;
+ }
+
do {
tg = list_entry_rcu(tg->list.next,
typeof(struct task_group), list);
@@ -509,9 +509,9 @@ static inline struct task_group *next_task_group(struct task_group *tg)
}
#define for_each_rt_rq(rt_rq, iter, rq) \
- for (iter = container_of(&task_groups, typeof(*iter), list); \
- (iter = next_task_group(iter)) && \
- (rt_rq = iter->rt_rq[cpu_of(rq)]);)
+ for (iter = &root_task_group; \
+ iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \
+ iter = next_task_group(iter))
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@@ -578,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
return p->prio != p->normal_prio;
}
-#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
return this_rq()->rd->span;
}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-#endif
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
@@ -609,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
rt_rq->rt_time < rt_b->rt_runtime);
}
-#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
*/
@@ -782,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq)
raw_spin_lock(&rt_rq->rt_runtime_lock);
}
}
-#else /* !CONFIG_SMP */
-static inline void balance_runtime(struct rt_rq *rt_rq) {}
-#endif /* CONFIG_SMP */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
@@ -914,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
typedef struct rt_rq *rt_rq_iter_t;
@@ -961,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
return &cpu_rq(cpu)->rt;
}
-#ifdef CONFIG_SMP
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
-#endif
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
@@ -1017,7 +1004,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
static void
@@ -1059,20 +1046,17 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
cpufreq_update_util(rq, 0);
}
-#if defined CONFIG_SMP
-
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
-#ifdef CONFIG_RT_GROUP_SCHED
/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (&rq->rt != rt_rq)
+ if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -1082,27 +1066,16 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
-#ifdef CONFIG_RT_GROUP_SCHED
/*
* Change rq's cpupri only if rt_rq is the top queue.
*/
- if (&rq->rt != rt_rq)
+ if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
-#else /* CONFIG_SMP */
-
-static inline
-void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-static inline
-void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-
-#endif /* CONFIG_SMP */
-
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
@@ -1141,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
-#else
-
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
-
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_RT_GROUP_SCHED
static void
@@ -1156,8 +1122,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+ start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}
static void
@@ -1169,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
@@ -1179,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -1257,11 +1222,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
static inline struct sched_statistics *
__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_RT_GROUP_SCHED
/* schedstats is not supported for rt group. */
if (!rt_entity_is_task(rt_se))
return NULL;
-#endif
return &rt_task_of(rt_se)->stats;
}
@@ -1477,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
+ if (task_is_blocked(p))
+ return;
+
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1527,7 +1493,6 @@ static void yield_task_rt(struct rq *rq)
requeue_task_rt(rq, rq->curr, 0);
}
-#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
static int
@@ -1642,7 +1607,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
-#endif /* CONFIG_SMP */
/*
* Preempt the current task with a newly woken task if needed:
@@ -1656,7 +1620,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
return;
}
-#ifdef CONFIG_SMP
/*
* If:
*
@@ -1671,7 +1634,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
*/
if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
-#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
@@ -1757,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+ if (task_is_blocked(p))
+ return;
/*
* The previous task needs to be made eligible for pushing
* if it is still active
@@ -1765,8 +1729,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
enqueue_pushable_task(rq, p);
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
@@ -1883,6 +1845,27 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1913,18 +1896,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1944,27 +1925,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(task_current_donor(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
@@ -2442,7 +2402,6 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
-#endif /* CONFIG_SMP */
/*
* When switching a task to RT, we may overload the runqueue
@@ -2466,10 +2425,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* then see if we can move to another run queue.
*/
if (task_on_rq_queued(p)) {
-#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
-#endif /* CONFIG_SMP */
if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
@@ -2486,7 +2443,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
return;
if (task_current_donor(rq, p)) {
-#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
* may need to pull tasks to this runqueue.
@@ -2500,11 +2456,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
-#else
- /* For UP simply resched on drop of prio */
- if (oldprio < p->prio)
- resched_curr(rq);
-#endif /* CONFIG_SMP */
} else {
/*
* This task is not running, but if it is
@@ -2540,9 +2491,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
}
}
-#else
+#else /* !CONFIG_POSIX_TIMERS: */
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
-#endif
+#endif /* !CONFIG_POSIX_TIMERS */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -2602,15 +2553,16 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
{
struct rt_rq *rt_rq;
-#ifdef CONFIG_RT_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq
rt_rq = task_group(p)->rt_rq[cpu];
+ WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
#else
rt_rq = &cpu_rq(cpu)->rt;
#endif
return rt_rq_throttled(rt_rq);
}
-#endif
+#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
@@ -2624,7 +2576,6 @@ DEFINE_SCHED_CLASS(rt) = {
.put_prev_task = put_prev_task_rt,
.set_next_task = set_next_task_rt,
-#ifdef CONFIG_SMP
.balance = balance_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
@@ -2633,7 +2584,6 @@ DEFINE_SCHED_CLASS(rt) = {
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
.find_lock_rq = find_lock_lowest_rq,
-#endif
.task_tick = task_tick_rt,
@@ -2713,6 +2663,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
return -EBUSY;
+ if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
+ return -EBUSY;
+
total = to_ratio(period, runtime);
/*
@@ -2868,13 +2821,13 @@ static int sched_rt_global_constraints(void)
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept real-time tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
return 0;
return 1;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
@@ -2882,7 +2835,7 @@ static int sched_rt_global_constraints(void)
return 0;
}
#endif /* CONFIG_SYSCTL */
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
@@ -2938,6 +2891,12 @@ undo:
sched_domains_mutex_unlock();
mutex_unlock(&mutex);
+ /*
+ * After changing maximum available bandwidth for DEADLINE, we need to
+ * recompute per-root-domain and per-CPU variables accordingly.
+ */
+ rebuild_sched_domains();
+
return ret;
}
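Hoisting pick_next_pushable_task() above find_lock_lowest_rq() lets the retry path collapse its old list of re-checks into one question: after the locks were dropped and re-taken, is the task still the head of the pushable list? The pthreads sketch below (userspace, illustrative only) shows that general revalidate-after-relock pattern, with two ordinary mutexes standing in for the two runqueue locks.

#include <pthread.h>
#include <stdio.h>

struct item { int id; struct item *next; };

static pthread_mutex_t src_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dst_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pushable_head;      /* protected by src_lock */

/* Mirrors pick_next_pushable_task(): the current head, or NULL. */
static struct item *pick_next_pushable(void)
{
        return pushable_head;
}

static int push_one(void)
{
        struct item *task;
        int pushed = 0;

        pthread_mutex_lock(&src_lock);
        task = pick_next_pushable();
        if (task) {
                /* Lock ordering forces us to drop src_lock first. */
                pthread_mutex_unlock(&src_lock);
                pthread_mutex_lock(&dst_lock);
                pthread_mutex_lock(&src_lock);

                /*
                 * The world may have changed while the lock was dropped:
                 * only act if @task is still the head of the list.
                 */
                if (task == pick_next_pushable()) {
                        pushable_head = task->next;
                        pushed = 1;
                }
                pthread_mutex_unlock(&dst_lock);
        }
        pthread_mutex_unlock(&src_lock);
        return pushed;
}

int main(void)
{
        struct item b = { 2, NULL }, a = { 1, &b };

        pushable_head = &a;
        for (int i = 0; i < 3; i++)
                printf("attempt %d: pushed=%d\n", i, push_one());
        return 0;
}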
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index c529706bed11..6803cfec7a1e 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+#include <linux/types.h>
static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..be9745d104f7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -69,6 +69,7 @@
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
#include <linux/delayacct.h>
+#include <linux/mmu_context.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
@@ -384,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task);
+extern void sched_init_dl_servers(void);
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
@@ -401,6 +403,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
extern struct list_head task_groups;
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
+extern const u64 max_bw_quota_period_us;
+
+/*
+ * default period for group bandwidth.
+ * default: 0.1s, units: microseconds
+ */
+static inline u64 default_bw_period_us(void)
+{
+ return 100000ULL;
+}
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
+
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
raw_spinlock_t lock;
@@ -424,7 +439,7 @@ struct cfs_bandwidth {
int nr_burst;
u64 throttled_time;
u64 burst_time;
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
};
/* Task group related information */
@@ -442,15 +457,13 @@ struct task_group {
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
-#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
* it in its own cache-line separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
@@ -459,10 +472,7 @@ struct task_group {
struct rt_bandwidth rt_bandwidth;
#endif
-#ifdef CONFIG_EXT_GROUP_SCHED
- u32 scx_flags; /* SCX_TG_* */
- u32 scx_weight;
-#endif
+ struct scx_task_group scx;
struct rcu_head rcu;
struct list_head list;
@@ -531,7 +541,7 @@ extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void free_fair_sched_group(struct task_group *tg) { }
static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
@@ -539,7 +549,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou
}
static inline void online_fair_sched_group(struct task_group *tg) { }
static inline void unregister_fair_sched_group(struct task_group *tg) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -573,25 +583,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
-#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
-#else /* !CONFIG_SMP */
-static inline void set_task_rq_fair(struct sched_entity *se,
- struct cfs_rq *prev, struct cfs_rq *next) { }
-#endif /* CONFIG_SMP */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
struct cfs_bandwidth { };
static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg);
@@ -667,7 +672,6 @@ struct cfs_rq {
struct sched_entity *curr;
struct sched_entity *next;
-#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
@@ -699,7 +703,6 @@ struct cfs_rq {
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
@@ -796,32 +799,28 @@ struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
int next; /* next highest */
-#endif
} highest_prio;
-#endif
-#ifdef CONFIG_SMP
bool overloaded;
struct plist_head pushable_tasks;
-#endif /* CONFIG_SMP */
int rt_queued;
#ifdef CONFIG_RT_GROUP_SCHED
int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
+ u64 rt_time; /* consumed RT time, goes up in update_curr_rt */
+ u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */
/* Nests inside the rq lock: */
raw_spinlock_t rt_runtime_lock;
unsigned int rt_nr_boosted;
- struct rq *rq;
- struct task_group *tg;
+ struct rq *rq; /* this is always top-level rq, cache? */
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */
#endif
};
@@ -837,7 +836,6 @@ struct dl_rq {
unsigned int dl_nr_running;
-#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
@@ -857,9 +855,7 @@ struct dl_rq {
* of the leftmost (earliest deadline) element.
*/
struct rb_root_cached pushable_dl_tasks_root;
-#else
- struct dl_bw dl_bw;
-#endif
+
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
@@ -930,7 +926,6 @@ static inline long se_runnable(struct sched_entity *se)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
@@ -1006,7 +1001,7 @@ struct root_domain {
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
-#endif
+#endif /* HAVE_RT_PUSH_IPI */
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -1041,7 +1036,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status)
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
-#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -1105,18 +1099,14 @@ struct rq {
unsigned int numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
-#ifdef CONFIG_SMP
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
call_single_data_t nohz_csd;
-#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
-#ifdef CONFIG_SMP
unsigned int ttwu_pending;
-#endif
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
@@ -1147,12 +1137,17 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned int nr_uninterruptible;
+ unsigned long nr_uninterruptible;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
union {
struct task_struct __rcu *donor; /* Scheduler context */
struct task_struct __rcu *curr; /* Execution context */
};
+#endif
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
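The #ifdef above is where the proxy-execution model becomes visible in the data structures: without CONFIG_SCHED_PROXY_EXEC the donor and curr pointers alias each other in a union, with it they can diverge, so the task charged for the CPU time (the scheduling context) need not be the task actually executing (for instance, a mutex owner running on a blocked waiter's behalf). A minimal sketch of that split, with made-up types that are not the kernel's:

#include <stdio.h>

struct task { const char *comm; int prio; };

struct runqueue {
        struct task *donor;     /* scheduling context: whose prio/slice is charged */
        struct task *curr;      /* execution context: who is actually on the CPU */
};

int main(void)
{
        struct task waiter = { "high-prio-waiter", 10 };
        struct task owner  = { "lock-owner", 120 };
        struct runqueue rq;

        /* The pick selects the high-priority task ... */
        rq.donor = &waiter;
        /* ... but it is blocked on a mutex, so the owner runs in its place. */
        rq.curr = &owner;

        printf("charging %s (prio %d), executing %s\n",
               rq.donor->comm, rq.donor->prio, rq.curr->comm);
        return 0;
}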
@@ -1181,7 +1176,6 @@ struct rq {
int membarrier_state;
#endif
-#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain __rcu *sd;
@@ -1222,7 +1216,6 @@ struct rq {
#ifdef CONFIG_HOTPLUG_CPU
struct rcuwait hotplug_wait;
#endif
-#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -1240,9 +1233,7 @@ struct rq {
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
call_single_data_t hrtick_csd;
-#endif
struct hrtimer hrtick_timer;
ktime_t hrtick_time;
#endif
@@ -1269,9 +1260,7 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
-#ifdef CONFIG_SMP
unsigned int nr_pinned;
-#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
@@ -1292,12 +1281,12 @@ struct rq {
unsigned int core_forceidle_seq;
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
-#endif
+#endif /* CONFIG_SCHED_CORE */
/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
-#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+#ifdef CONFIG_CFS_BANDWIDTH
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
@@ -1311,32 +1300,24 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return cfs_rq->rq;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return container_of(cfs_rq, struct rq, cfs);
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline int cpu_of(struct rq *rq)
{
-#ifdef CONFIG_SMP
return rq->cpu;
-#else
- return 0;
-#endif
}
#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
-#ifdef CONFIG_SMP
return p->migration_disabled;
-#else
- return false;
-#endif
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1347,10 +1328,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ rcu_assign_pointer(rq->donor, t);
+}
+#else
static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
{
/* Do nothing */
}
+#endif
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -1499,6 +1487,24 @@ static inline bool sched_group_cookie_match(struct rq *rq,
#endif /* !CONFIG_SCHED_CORE */
+#ifdef CONFIG_RT_GROUP_SCHED
+# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
+DECLARE_STATIC_KEY_FALSE(rt_group_sched);
+static inline bool rt_group_sched_enabled(void)
+{
+ return static_branch_unlikely(&rt_group_sched);
+}
+# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */
+DECLARE_STATIC_KEY_TRUE(rt_group_sched);
+static inline bool rt_group_sched_enabled(void)
+{
+ return static_branch_likely(&rt_group_sched);
+}
+# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
+# define rt_group_sched_enabled() false
+#endif /* !CONFIG_RT_GROUP_SCHED */
+
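The rt_group_sched_enabled() helpers added above follow the usual jump-label pattern: the declaration picks a default-true or default-false key depending on CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED, and callers get a near-free branch. A hedged sketch of how this is meant to be consumed; the definition line pairs with the declarations above, while the caller below is illustrative only and not part of this patch:

/* Definition side: this key or its _FALSE twin, matching the Kconfig default. */
DEFINE_STATIC_KEY_TRUE(rt_group_sched);

/* Illustrative caller: fall back to the root rt_rq when group RT sched is off. */
static inline struct rt_rq *example_rt_rq_of(struct task_struct *p, int cpu)
{
	if (!rt_group_sched_enabled())
		return &cpu_rq(cpu)->rt;

	return task_group(p)->rt_rq[cpu];
}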
static inline void lockdep_assert_rq_held(struct rq *rq)
{
lockdep_assert_held(__rq_lockp(rq));
@@ -1555,9 +1561,9 @@ static inline void update_idle_core(struct rq *rq)
__update_idle_core(rq);
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline void update_idle_core(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_SCHED_SMT */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1717,10 +1723,10 @@ extern struct balance_callback balance_push_callback;
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
-DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */
DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
-#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_enabled() static_branch_unlikely(&__scx_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
@@ -1738,7 +1744,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq)
WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
}
-#else /* !CONFIG_SCHED_CLASS_EXT */
+#else /* !CONFIG_SCHED_CLASS_EXT: */
#define scx_enabled() false
#define scx_switched_all() false
@@ -1762,9 +1768,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#ifdef CONFIG_SMP
WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@@ -1942,8 +1946,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#endif /* !CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_SMP
-
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
@@ -2109,8 +2111,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
return p->user_cpus_ptr;
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -2146,9 +2146,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * p->rt.rt_rq is NULL initially and it is easier to assign
+	 * root_task_group's rt_rq than switching in rt_rq_of_se().
+ * Clobbers tg(!)
+ */
+ if (!rt_group_sched_enabled())
+ tg = &root_task_group;
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
#else /* !CONFIG_CGROUP_SCHED: */
@@ -2174,7 +2181,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
-#endif
+#endif /* CONFIG_SMP */
}
/*
@@ -2252,13 +2259,17 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p)
return rq->donor == p;
}
+static inline bool task_is_blocked(struct task_struct *p)
+{
+ if (!sched_proxy_exec())
+ return false;
+
+ return !!p->blocked_on;
+}
+
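task_is_blocked() only reports a non-NULL p->blocked_on when proxy execution is compiled in and enabled. A hedged, illustrative sketch of the shape of a consumer; example_proxy_to_owner() is hypothetical shorthand for "run the mutex owner on the blocked task's behalf" and is not an API this hunk adds:

static struct task_struct *example_pick_next(struct rq *rq, struct task_struct *next)
{
	/* Runnable in its own right: nothing to proxy. */
	if (!task_is_blocked(next))
		return next;

	/* Hypothetical: hand the CPU to whoever holds the mutex. */
	return example_proxy_to_owner(rq, next);
}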
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_SMP
return p->on_cpu;
-#else
- return task_current(rq, p);
-#endif
}
static inline int task_on_rq_queued(struct task_struct *p)
@@ -2281,11 +2292,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
-#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
static_assert(WF_FORK == SD_BALANCE_FORK);
static_assert(WF_TTWU == SD_BALANCE_WAKE);
-#endif
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -2341,11 +2350,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_HEAD 0x10
#define ENQUEUE_REPLENISH 0x20
-#ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED 0x40
-#else
-#define ENQUEUE_MIGRATED 0x00
-#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
#define ENQUEUE_DELAYED 0x200
@@ -2390,7 +2395,6 @@ struct sched_class {
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
-#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
@@ -2403,7 +2407,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
-#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
@@ -2461,7 +2464,7 @@ static inline void put_prev_set_next_task(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != prev);
+ WARN_ON_ONCE(rq->donor != prev);
__put_prev_set_next_dl_server(rq, prev, next);
@@ -2555,8 +2558,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq);
#define SCA_MIGRATE_ENABLE 0x04
#define SCA_USER 0x08
-#ifdef CONFIG_SMP
-
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void sched_balance_trigger(struct rq *rq);
@@ -2608,26 +2609,6 @@ static inline struct task_struct *get_push_task(struct rq *rq)
extern int push_cpu_stop(void *arg);
-#else /* !CONFIG_SMP: */
-
-static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
-{
- return true;
-}
-
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
-{
- return set_cpus_allowed_ptr(p, ctx->new_mask);
-}
-
-static inline cpumask_t *alloc_user_cpus_ptr(int node)
-{
- return NULL;
-}
-
-#endif /* !CONFIG_SMP */
-
#ifdef CONFIG_CPU_IDLE
static inline void idle_set_state(struct rq *rq,
@@ -2723,10 +2704,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
call_trace_sched_update_nr_running(rq, count);
}
-#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2)
set_rd_overloaded(rq->rd, 1);
-#endif
sched_update_tick_dependency(rq);
}
@@ -2892,10 +2871,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
{
rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
- /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
-#ifdef CONFIG_SMP
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
-#endif
}
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
@@ -2904,8 +2880,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock
{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
_lock; return _t; }
-#ifdef CONFIG_SMP
-
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
#ifdef CONFIG_SCHED_CORE
@@ -2928,7 +2902,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
/*
* __sched_core_flip() relies on SMT having cpu-id lock order.
*/
-#endif
+#endif /* CONFIG_SCHED_CORE */
return rq1->cpu < rq2->cpu;
}
@@ -3065,42 +3039,6 @@ extern void set_rq_offline(struct rq *rq);
extern bool sched_smp_initialized;
-#else /* !CONFIG_SMP: */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- WARN_ON_ONCE(!irqs_disabled());
- WARN_ON_ONCE(rq1 != rq2);
- raw_spin_rq_lock(rq1);
- __acquire(rq2->lock); /* Fake it out ;) */
- double_rq_clock_clear_update(rq1, rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- WARN_ON_ONCE(rq1 != rq2);
- raw_spin_rq_unlock(rq1);
- __release(rq2->lock);
-}
-
-#endif /* !CONFIG_SMP */
-
DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
double_rq_unlock(_T->lock, _T->lock2))
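With the UP fallbacks removed, the SMP double_rq_lock()/double_rq_unlock() pair is the only definition left, and the DEFINE_LOCK_GUARD_2() wrapper above now covers every build. A hedged usage sketch; the function and the work done under the locks are illustrative, not from this patch, and as with double_rq_lock() itself the caller is assumed to have IRQs disabled:

static void example_pull_task(struct rq *src_rq, struct rq *dst_rq,
			      struct task_struct *p)
{
	/* Locks both runqueues in the canonical order, unlocks on return. */
	guard(double_rq_lock)(src_rq, dst_rq);

	if (task_on_rq_queued(p))
		move_queued_task_locked(src_rq, dst_rq, p);
}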
@@ -3119,6 +3057,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
+
#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
@@ -3158,7 +3097,7 @@ extern void nohz_balance_exit_idle(struct rq *rq);
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif /* !CONFIG_NO_HZ_COMMON */
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
extern void nohz_run_idle_balance(int cpu);
#else
static inline void nohz_run_idle_balance(int cpu) { }
@@ -3228,14 +3167,14 @@ static inline u64 irq_time_read(int cpu)
return total;
}
-#else
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline int irqtime_enabled(void)
{
return 0;
}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
@@ -3284,8 +3223,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
# define arch_scale_freq_invariant() false
#endif
-#ifdef CONFIG_SMP
-
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
@@ -3329,10 +3266,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
-#else /* !CONFIG_SMP */
-static inline bool update_other_load_avgs(struct rq *rq) { return false; }
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
@@ -3509,15 +3442,13 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
-extern struct cpufreq_governor schedutil_gov;
-
-#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
#define perf_domain_span(pd) NULL
static inline bool sched_energy_enabled(void) { return false; }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#ifdef CONFIG_MEMBARRIER
@@ -3543,7 +3474,7 @@ static inline void membarrier_switch_mm(struct rq *rq,
WRITE_ONCE(rq->membarrier_state, membarrier_state);
}
-#else /* !CONFIG_MEMBARRIER :*/
+#else /* !CONFIG_MEMBARRIER: */
static inline void membarrier_switch_mm(struct rq *rq,
struct mm_struct *prev_mm,
@@ -3553,7 +3484,6 @@ static inline void membarrier_switch_mm(struct rq *rq,
#endif /* !CONFIG_MEMBARRIER */
-#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
{
if (!(p->flags & PF_KTHREAD))
@@ -3564,7 +3494,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
return true;
}
-#endif
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
@@ -3863,7 +3792,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
-#ifdef CONFIG_SMP
static inline
void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
{
@@ -3884,7 +3812,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
return false;
}
-#endif
#ifdef CONFIG_RT_MUTEXES
@@ -3925,21 +3852,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio);
-#ifdef CONFIG_SMP
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#else
-
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
-{
- return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
-{
-}
-
-#endif
#ifdef CONFIG_SCHED_CLASS_EXT
/*
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 21ac44428bb0..7f151d96dba9 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -1,8 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_SMP_H
+#define _KERNEL_SCHED_SMP_H
+
/*
* Scheduler internal SMP callback types and methods between the scheduler
* and other internal parts of the core kernel:
*/
+#include <linux/types.h>
extern void sched_ttwu_pending(void *arg);
@@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void);
#else
static inline void flush_smp_call_function_queue(void) { }
#endif
+
+#endif /* _KERNEL_SCHED_SMP_H */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 4346fd81c31f..d1c9429a4ac5 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,6 +2,7 @@
/*
* /proc/schedstat implementation
*/
+#include "sched.h"
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
@@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "timestamp %lu\n", jiffies);
} else {
struct rq *rq;
-#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
-#endif
cpu = (unsigned long)(v - 2);
rq = cpu_rq(cpu);
@@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
-#ifdef CONFIG_SMP
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_move_balance);
}
rcu_read_unlock();
-#endif
}
return 0;
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 452826df6ae1..26f3fd4d34ce 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
-#else
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
-#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
-#else /* CONFIG_PSI */
+#else /* !CONFIG_PSI: */
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
@@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
-#endif /* CONFIG_PSI */
+#endif /* !CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
/*
@@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n
# define sched_info_enqueue(rq, t) do { } while (0)
# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHED_INFO */
+#endif /* !CONFIG_SCHED_INFO */
#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 058dd42e3d9b..2d4e279f05ee 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,8 +7,8 @@
*
* See kernel/stop_machine.c
*/
+#include "sched.h"
-#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
@@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return sched_stop_runnable(rq);
}
-#endif /* CONFIG_SMP */
static void
wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
@@ -106,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = {
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
-#ifdef CONFIG_SMP
.balance = balance_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_stop,
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 72505cd3b60a..0fef6496c4c8 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,6 +2,7 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
+#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index c326de1344fb..77ae87f36e84 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment)
return 0;
}
-#endif
+#endif /* __ARCH_WANT_SYS_NICE */
/**
* task_prio - return the priority value of a given task.
@@ -209,10 +209,8 @@ int idle_cpu(int cpu)
if (rq->nr_running)
return 0;
-#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
-#endif
return 1;
}
@@ -255,8 +253,7 @@ int sched_core_idle_cpu(int cpu)
return idle_cpu(cpu);
}
-
-#endif
+#endif /* CONFIG_SCHED_CORE */
/**
* find_process_by_pid - find a process with a matching PID value.
@@ -448,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p,
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
/*
* Allow unprivileged RT tasks to decrease priority.
@@ -634,14 +631,14 @@ change:
* Do not allow real-time tasks into groups that have no runtime
* assigned.
*/
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ if (rt_group_sched_enabled() &&
+ rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
retval = -EPERM;
goto unlock;
}
-#endif
-#ifdef CONFIG_SMP
+#endif /* CONFIG_RT_GROUP_SCHED */
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
@@ -657,7 +654,6 @@ change:
goto unlock;
}
}
-#endif
}
/* Re-check policy now with rq lock held: */
@@ -1119,7 +1115,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
-#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
/*
@@ -1148,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
}
-#endif /* CONFIG_SMP */
int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
@@ -1241,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
if (user_mask) {
cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
+ } else {
return -ENOMEM;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1ebc60d967f..977e133bb8a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,7 +3,9 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
void sched_domains_mutex_lock(void)
@@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -212,8 +214,6 @@ static bool sched_energy_update;
static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
{
bool any_asym_capacity = false;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
int i;
/* EAS is enabled for asymmetric CPU capacity topologies. */
@@ -248,25 +248,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
return false;
}
- /* Do not attempt EAS if schedutil is not being used. */
- for_each_cpu(i, cpu_mask) {
- policy = cpufreq_cpu_get(i);
- if (!policy) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
- cpumask_pr_args(cpu_mask), i);
- }
- return false;
- }
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_mask));
- }
- return false;
+ if (!cpufreq_ready_for_eas(cpu_mask)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
+ cpumask_pr_args(cpu_mask));
}
+ return false;
}
return true;
@@ -328,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void)
}
late_initcall(sched_energy_aware_sysctl_init);
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
static void free_pd(struct perf_domain *pd)
{
@@ -464,9 +451,9 @@ free:
return false;
}
-#else
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
static void free_rootdomain(struct rcu_head *rcu)
{
@@ -1333,6 +1320,60 @@ next:
update_group_capacity(sd, cpu);
}
+/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
+void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
+{
+ int asym_prefer_cpu = cpu;
+ struct sched_domain *sd;
+
+ guard(rcu)();
+
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg;
+ int group_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ continue;
+
+ /*
+		 * Groups of overlapping domains are replicated per NUMA
+ * node and will require updating "asym_prefer_cpu" on
+ * each local copy.
+ *
+ * If you are hitting this warning, consider moving
+ * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
+ * which is shared by all the overlapping groups.
+ */
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
+
+ sg = sd->groups;
+ if (cpu != sg->asym_prefer_cpu) {
+ /*
+ * Since the parent is a superset of the current group,
+ * if the cpu is not the "asym_prefer_cpu" at the
+			 * current level, it cannot be the preferred CPU at
+			 * higher levels either.
+ */
+ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu))
+ return;
+
+ WRITE_ONCE(sg->asym_prefer_cpu, cpu);
+ continue;
+ }
+
+ /* Ranking has improved; CPU is still the preferred one. */
+ if (new_prio >= old_prio)
+ continue;
+
+ for_each_cpu(group_cpu, sched_group_span(sg)) {
+ if (sched_asym_prefer(group_cpu, asym_prefer_cpu))
+ asym_prefer_cpu = group_cpu;
+ }
+
+ WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu);
+ }
+}
+
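sched_update_asym_prefer_cpu() exists so that architecture code which re-ranks a CPU at runtime can refresh the cached sg->asym_prefer_cpu without a full domain rebuild. A hedged sketch of the expected call site; the per-CPU ranking variable and wrapper are illustrative, assuming arch_asym_cpu_priority() reads the same ranking:

static DEFINE_PER_CPU(int, example_core_prio);

static void example_update_core_prio(int cpu, int new_prio)
{
	int old_prio = per_cpu(example_core_prio, cpu);

	per_cpu(example_core_prio, cpu) = new_prio;
	sched_update_asym_prefer_cpu(cpu, old_prio, new_prio);
}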
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
@@ -1555,7 +1596,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-#endif
+#endif /* CONFIG_NUMA */
/*
* SD_flags allowed in topology descriptions.
@@ -1671,7 +1712,7 @@ sd_init(struct sched_domain_topology_level *tl,
SD_WAKE_AFFINE);
}
-#endif
+#endif /* CONFIG_NUMA */
} else {
sd->cache_nice_tries = 1;
}
@@ -1696,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl,
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
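SDTL_INIT(mask_fn, flags_fn, name) collapses the old designated-initializer blocks into one line per level. A hedged sketch of how an architecture could build and install its own table with the new initializer; the table contents are illustrative, and set_sched_topology() is the existing hook for installing it:

static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#endif
	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
	{ NULL, },
};

static void __init example_install_topology(void)
{
	set_sched_topology(example_topology);
}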
@@ -1967,23 +2008,14 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
}
sched_domain_topology_saved = sched_domain_topology;
@@ -2098,7 +2130,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
for (i = 0; i < sched_domains_numa_levels; i++) {
if (!masks[i][j])
break;
- cpu = cpumask_any_and(cpus, masks[i][j]);
+ cpu = cpumask_any_and_distribute(cpus, masks[i][j]);
if (cpu < nr_cpu_ids) {
found = cpu;
break;
@@ -2294,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2347,35 +2379,58 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
/*
* Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
- * any two given CPUs at this (non-NUMA) topology level.
+ * any two given CPUs on non-NUMA topology levels.
*/
-static bool topology_span_sane(struct sched_domain_topology_level *tl,
- const struct cpumask *cpu_map, int cpu)
+static bool topology_span_sane(const struct cpumask *cpu_map)
{
- int i = cpu + 1;
+ struct sched_domain_topology_level *tl;
+ struct cpumask *covered, *id_seen;
+ int cpu;
- /* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
- return true;
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
+ id_seen = sched_domains_tmpmask2;
+
+ for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
+
+ /* NUMA levels are allowed to overlap */
+ if (tl_common_flags & SD_NUMA)
+ continue;
+
+ cpumask_clear(covered);
+ cpumask_clear(id_seen);
- /*
- * Non-NUMA levels cannot partially overlap - they must be either
- * completely equal or completely disjoint. Otherwise we can end up
- * breaking the sched_group lists - i.e. a later get_group() pass
- * breaks the linking done for an earlier span.
- */
- for_each_cpu_from(i, cpu_map) {
/*
- * We should 'and' all those masks with 'cpu_map' to exactly
- * match the topology we're about to build, but that can only
- * remove CPUs, which only lessens our ability to detect
- * overlaps
+ * Non-NUMA levels cannot partially overlap - they must be either
+ * completely equal or completely disjoint. Otherwise we can end up
+ * breaking the sched_group lists - i.e. a later get_group() pass
+ * breaks the linking done for an earlier span.
*/
- if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
- cpumask_intersects(tl->mask(cpu), tl->mask(i)))
- return false;
- }
+ for_each_cpu(cpu, cpu_map) {
+ const struct cpumask *tl_cpu_mask = tl->mask(cpu);
+ int id;
+ /* lowest bit set in this mask is used as a unique id */
+ id = cpumask_first(tl_cpu_mask);
+
+ if (cpumask_test_cpu(id, id_seen)) {
+ /* First CPU has already been seen, ensure identical spans */
+ if (!cpumask_equal(tl->mask(id), tl_cpu_mask))
+ return false;
+ } else {
+ /* First CPU hasn't been seen before, ensure it's a completely new span */
+ if (cpumask_intersects(tl_cpu_mask, covered))
+ return false;
+
+ cpumask_or(covered, covered, tl_cpu_mask);
+ cpumask_set_cpu(id, id_seen);
+ }
+ }
+ }
return true;
}
@@ -2408,27 +2463,25 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
sd = NULL;
for_each_sd_topology(tl) {
- if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
- goto error;
-
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
}
+ if (WARN_ON(!topology_span_sane(cpu_map)))
+ goto error;
+
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..20f27e2cf7ae 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,6 +4,7 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
+#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
@@ -40,13 +41,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
{
unsigned long flags;
- wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ wq_entry->flags |= WQ_FLAG_PRIORITY;
spin_lock_irqsave(&wq_head->lock, flags);
__add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry)
+{
+ struct list_head *head = &wq_head->head;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+ guard(spinlock_irqsave)(&wq_head->lock);
+
+ if (!list_empty(head) &&
+ (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+ return -EBUSY;
+
+ list_add(&wq_entry->entry, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -64,7 +83,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* the non-exclusive tasks. Normally, exclusive tasks will be at the end of
* the list and any non-exclusive tasks will be woken first. A priority task
* may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
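add_wait_queue_priority_exclusive() above lets exactly one exclusive+priority waiter claim the head of the queue and returns -EBUSY if another priority waiter is already installed. A hedged caller sketch; the function and callback names are illustrative:

static int example_attach_notifier(struct wait_queue_head *wqh,
				   struct wait_queue_entry *wait,
				   wait_queue_func_t func)
{
	init_waitqueue_func_entry(wait, func);

	/* -EBUSY means some other priority waiter won the race. */
	return add_wait_queue_priority_exclusive(wqh, wait);
}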
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index b410b61cec95..1088d3b7012c 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,5 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include "sched.h"
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/