Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  893
1 file changed, 529 insertions, 364 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..3ec00d08d46a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -69,8 +69,8 @@
#include <linux/livepatch_sched.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
-# ifdef CONFIG_GENERIC_ENTRY
-# include <linux/entry-common.h>
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
# endif
#endif
@@ -96,6 +96,7 @@
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+#include "../locking/mutex.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
@@ -119,6 +120,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static int __init setup_proxy_exec(char *str)
+{
+ bool proxy_enable = true;
+
+ if (*str && kstrtobool(str + 1, &proxy_enable)) {
+ pr_warn("Unable to parse sched_proxy_exec=\n");
+ return 0;
+ }
+
+ if (proxy_enable) {
+ pr_info("sched_proxy_exec enabled via boot arg\n");
+ static_branch_enable(&__sched_proxy_exec);
+ } else {
+ pr_info("sched_proxy_exec disabled via boot arg\n");
+ static_branch_disable(&__sched_proxy_exec);
+ }
+ return 1;
+}
+#else
+static int __init setup_proxy_exec(char *str)
+{
+ pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
+ return 0;
+}
+#endif
+__setup("sched_proxy_exec", setup_proxy_exec);
+
/*
* Debugging: various feature bits
*
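
For reference, the setup_proxy_exec() handler above only flips the __sched_proxy_exec static key; the rest of this patch queries it through sched_proxy_exec() (see proxy_tag_curr() further down). A minimal sketch of that helper, assuming the usual static-branch idiom; the real definition lives in a scheduler header outside this diff and may differ:

/* Illustrative sketch only, not part of this patch. */
#ifdef CONFIG_SCHED_PROXY_EXEC
static inline bool sched_proxy_exec(void)
{
	return static_branch_likely(&__sched_proxy_exec);
}
#else
static inline bool sched_proxy_exec(void)
{
	return false;
}
#endif

In practice, booting with sched_proxy_exec=0 takes the static_branch_disable() path in setup_proxy_exec(), while a bare sched_proxy_exec or sched_proxy_exec=1 keeps the default-true key enabled.
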
@@ -481,13 +511,13 @@ void sched_core_put(void)
schedule_work(&_work);
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
@@ -650,7 +680,6 @@ void raw_spin_rq_unlock(struct rq *rq)
raw_spin_unlock(rq_lockp(rq));
}
-#ifdef CONFIG_SMP
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -667,7 +696,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
double_rq_clock_clear_update(rq1, rq2);
}
-#endif
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -853,8 +881,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-#ifdef CONFIG_SMP
-
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
@@ -899,33 +925,12 @@ void hrtick_start(struct rq *rq, u64 delay)
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and IRQs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- delay = max_t(u64, delay, 10000LL);
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED_HARD);
-}
-
-#endif /* CONFIG_SMP */
-
static void hrtick_rq_init(struct rq *rq)
{
-#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
-#endif
hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
-#else /* CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void hrtick_clear(struct rq *rq)
{
}
@@ -933,7 +938,7 @@ static inline void hrtick_clear(struct rq *rq)
static inline void hrtick_rq_init(struct rq *rq)
{
}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
/*
* try_cmpxchg based fetch_or() macro so it works for different integer types:
@@ -949,7 +954,7 @@ static inline void hrtick_rq_init(struct rq *rq)
_val; \
})
-#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -988,13 +993,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
return true;
}
-#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
-#endif
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
@@ -1167,7 +1170,6 @@ void resched_cpu(int cpu)
raw_spin_rq_unlock_irqrestore(rq, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy CPU for migrating timers
@@ -1374,10 +1376,8 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-#endif /* CONFIG_SMP */
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
- (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
@@ -1971,7 +1971,7 @@ undo:
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
-#endif
+#endif /* CONFIG_SYSCTL */
static void uclamp_fork(struct task_struct *p)
{
@@ -2037,13 +2037,13 @@ static void __init init_uclamp(void)
}
}
-#else /* !CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK: */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
-#endif /* CONFIG_UCLAMP_TASK */
+#endif /* !CONFIG_UCLAMP_TASK */
bool sched_task_on_rq(struct task_struct *p)
{
@@ -2353,8 +2353,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
return ncsw;
}
-#ifdef CONFIG_SMP
-
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
@@ -2936,8 +2934,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ /*
+ * Can the task run on the task's current CPU? If so, we're done
+ *
+ * We are also done if the task is the current donor, boosting a lock-
+ * holding proxy, (and potentially has been migrated outside its
+ * current or previous affinity mask)
+ */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) ||
+ (task_current_donor(rq, p) && !task_current(rq, p))) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
@@ -3305,6 +3310,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
WARN_ON_ONCE(ret);
}
+#ifdef CONFIG_SMP
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
unsigned int state = READ_ONCE(p->__state);
@@ -3358,14 +3365,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- __schedstat_inc(p->stats.numa_task_swapped);
- count_vm_numa_event(NUMA_TASK_SWAP);
- count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
-
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -3661,17 +3665,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else /* CONFIG_SMP */
-
-static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
- return false;
-}
-
-#endif /* !CONFIG_SMP */
-
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3682,7 +3675,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
rq = this_rq();
-#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->stats.nr_wakeups_local);
@@ -3702,7 +3694,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->stats.nr_wakeups_migrate);
-#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->stats.nr_wakeups);
@@ -3731,13 +3722,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
-#ifdef CONFIG_SMP
if (wake_flags & WF_RQ_SELECTED)
en_flags |= ENQUEUE_RQ_SELECTED;
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
-#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
@@ -3748,7 +3737,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
ttwu_do_wakeup(p);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Our task @p is fully woken up and running; so it's safe to
@@ -3770,7 +3758,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
-#endif
}
/*
@@ -3824,7 +3811,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
return ret;
}
-#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
@@ -3891,7 +3877,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
+#ifdef CONFIG_SMP
__smp_call_single_queue(cpu, &p->wake_entry.llist);
+#endif
}
void wake_up_if_idle(int cpu)
@@ -3943,6 +3931,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
@@ -3992,15 +3985,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
-#else /* !CONFIG_SMP */
-
-static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
@@ -4256,7 +4240,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
-#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -4335,9 +4318,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-#else
- cpu = task_cpu(p);
-#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
}
@@ -4370,14 +4350,12 @@ static bool __task_needs_rq_lock(struct task_struct *p)
if (p->on_rq)
return true;
-#ifdef CONFIG_SMP
/*
* Ensure the task has finished __schedule() and will not be referenced
* anymore. Again, see try_to_wake_up() for a longer comment.
*/
smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
-#endif
return false;
}
@@ -4533,10 +4511,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
-#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
-#endif
init_sched_mm_cid(p);
}
@@ -4599,8 +4575,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write,
}
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SCHEDSTATS
@@ -4787,14 +4763,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP)
p->on_cpu = 0;
-#endif
init_task_preempt_count(p);
-#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-#endif
+
return 0;
}
@@ -4871,7 +4844,6 @@ void wake_up_new_task(struct task_struct *p)
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
-#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_ptr can change in the fork path
@@ -4883,7 +4855,6 @@ void wake_up_new_task(struct task_struct *p)
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
-#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
@@ -4891,7 +4862,6 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, wake_flags);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so it's fine to
@@ -4901,7 +4871,6 @@ void wake_up_new_task(struct task_struct *p)
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
-#endif
task_rq_unlock(rq, p, &rf);
}
@@ -4978,7 +4947,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
__fire_sched_out_preempt_notifiers(curr, next);
}
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
+#else /* !CONFIG_PREEMPT_NOTIFIERS: */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
@@ -4990,11 +4959,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
{
}
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
static inline void prepare_task(struct task_struct *next)
{
-#ifdef CONFIG_SMP
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
@@ -5003,12 +4971,10 @@ static inline void prepare_task(struct task_struct *next)
* its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
-#endif
}
static inline void finish_task(struct task_struct *prev)
{
-#ifdef CONFIG_SMP
/*
* This must be the very last reference to @prev from this CPU. After
* p->on_cpu is cleared, the task can be moved to a different CPU. We
@@ -5021,11 +4987,8 @@ static inline void finish_task(struct task_struct *prev)
* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
-#endif
}
-#ifdef CONFIG_SMP
-
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
@@ -5107,14 +5070,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head)
}
}
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
-#endif
-
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -5502,8 +5457,6 @@ unsigned int nr_iowait(void)
return sum;
}
-#ifdef CONFIG_SMP
-
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
@@ -5527,8 +5480,6 @@ void sched_exec(void)
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
-#endif
-
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
@@ -5563,7 +5514,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+#ifdef CONFIG_64BIT
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
@@ -5690,12 +5641,10 @@ void sched_tick(void)
if (donor->flags & PF_WQ_WORKER)
wq_worker_tick(donor);
-#ifdef CONFIG_SMP
if (!scx_switched_all()) {
rq->idle_balance = idle_cpu(cpu);
sched_balance_trigger(rq);
}
-#endif
}
#ifdef CONFIG_NO_HZ_FULL
@@ -5835,10 +5784,10 @@ int __init sched_tick_offload_init(void)
return 0;
}
-#else /* !CONFIG_NO_HZ_FULL */
+#else /* !CONFIG_NO_HZ_FULL: */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
@@ -6553,7 +6502,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu)
rq->core = rq;
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
@@ -6565,7 +6514,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return __pick_next_task(rq, prev, rf);
}
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/*
* Constants for the sched_mode argument of __schedule().
@@ -6581,11 +6530,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Helper function for __schedule()
*
- * If a task does not have signals pending, deactivate it
- * Otherwise marks the task's __state as RUNNING
+ * Tries to deactivate the task, unless the should_block arg
+ * is false or a signal is pending. If a signal is pending,
+ * marks the task's __state as RUNNING (and clears
+ * blocked_on).
*/
static bool try_to_block_task(struct rq *rq, struct task_struct *p,
- unsigned long *task_state_p)
+ unsigned long *task_state_p, bool should_block)
{
unsigned long task_state = *task_state_p;
int flags = DEQUEUE_NOCLOCK;
@@ -6596,6 +6547,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return false;
}
+ /*
+ * We check should_block after signal_pending because we
+ * will want to wake the task in that case. But if
+ * should_block is false, it's likely due to the task being
+ * blocked on a mutex, and we want to keep it on the runqueue
+ * to be selectable for proxy-execution.
+ */
+ if (!should_block)
+ return false;
+
p->sched_contributes_to_load =
(task_state & TASK_UNINTERRUPTIBLE) &&
!(task_state & TASK_NOLOAD) &&
@@ -6619,6 +6580,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return true;
}
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline struct task_struct *proxy_resched_idle(struct rq *rq)
+{
+ put_prev_set_next_task(rq, rq->donor, rq->idle);
+ rq_set_donor(rq, rq->idle);
+ set_tsk_need_resched(rq->idle);
+ return rq->idle;
+}
+
+static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ unsigned long state = READ_ONCE(donor->__state);
+
+ /* Don't deactivate if the state has been changed to TASK_RUNNING */
+ if (state == TASK_RUNNING)
+ return false;
+ /*
+ * Because we got donor from pick_next_task(), it is *crucial*
+ * that we call proxy_resched_idle() before we deactivate it.
+ * As once we deactivate donor, donor->on_rq is set to zero,
+ * which allows ttwu() to immediately try to wake the task on
+ * another rq. So we cannot use *any* references to donor
+ * after that point. So things like cfs_rq->curr or rq->donor
+ * need to be changed from next *before* we deactivate.
+ */
+ proxy_resched_idle(rq);
+ return try_to_block_task(rq, donor, &state, true);
+}
+
+static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ if (!__proxy_deactivate(rq, donor)) {
+ /*
+ * XXX: For now, if deactivation failed, set donor
+ * as unblocked, as we aren't doing proxy-migrations
+ * yet (more logic will be needed then).
+ */
+ donor->blocked_on = NULL;
+ }
+ return NULL;
+}
+
+/*
+ * Find runnable lock owner to proxy for mutex blocked donor
+ *
+ * Follow the blocked-on relation:
+ * task->blocked_on -> mutex->owner -> task...
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * mutex->wait_lock
+ *
+ * Returns the task that is going to be used as execution context (the one
+ * that is actually going to be run on cpu_of(rq)).
+ */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ struct task_struct *owner = NULL;
+ int this_cpu = cpu_of(rq);
+ struct task_struct *p;
+ struct mutex *mutex;
+
+ /* Follow blocked_on chain. */
+ for (p = donor; task_is_blocked(p); p = owner) {
+ mutex = p->blocked_on;
+ /* Something changed in the chain, so pick again */
+ if (!mutex)
+ return NULL;
+ /*
+ * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
+ * and ensure @owner sticks around.
+ */
+ guard(raw_spinlock)(&mutex->wait_lock);
+
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain and
+ * we don't know if only at this level. So, let's
+ * just bail out completely and let __schedule()
+ * figure things out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+
+ if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
+ /* XXX Don't handle blocked owners/delayed dequeue yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_cpu(owner) != this_cpu) {
+ /* XXX Don't handle migrations yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_on_rq_migrating(owner)) {
+ /*
+ * One of the chain of mutex owners is currently migrating to this
+ * CPU, but has not yet been enqueued because we are holding the
+ * rq lock. As a simple solution, just schedule rq->idle to give
+ * the migration a chance to complete. Much like the migrate_task
+ * case we should end up back in find_proxy_task(), this time
+ * hopefully with all relevant tasks already enqueued.
+ */
+ return proxy_resched_idle(rq);
+ }
+
+ /*
+ * It's possible to race where after we check owner->on_rq
+ * but before we check (owner_cpu != this_cpu) that the
+ * task on another cpu was migrated back to this cpu. In
+ * that case it could slip by our checks. So double check
+ * we are still on this cpu and not migrating. If we get
+ * inconsistent results, try again.
+ */
+ if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu)
+ return NULL;
+
+ if (owner == p) {
+ /*
+ * It's possible we interleave with mutex_unlock like:
+ *
+ * lock(&rq->lock);
+ * find_proxy_task()
+ * mutex_unlock()
+ * lock(&wait_lock);
+ * donor(owner) = current->blocked_donor;
+ * unlock(&wait_lock);
+ *
+ * wake_up_q();
+ * ...
+ * ttwu_runnable()
+ * __task_rq_lock()
+ * lock(&wait_lock);
+ * owner == p
+ *
+ * Which leaves us to finish the ttwu_runnable() and make it go.
+ *
+ * So schedule rq->idle so that ttwu_runnable() can get the rq
+ * lock and mark owner as running.
+ */
+ return proxy_resched_idle(rq);
+ }
+ /*
+ * OK, now we're absolutely sure @owner is on this
+ * rq, therefore holding @rq->lock is sufficient to
+ * guarantee its existence, as per ttwu_remote().
+ */
+ }
+
+ WARN_ON_ONCE(owner && !owner->on_rq);
+ return owner;
+}
+#else /* !CONFIG_SCHED_PROXY_EXEC: */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n");
+ return donor;
+}
+#endif /* !CONFIG_SCHED_PROXY_EXEC */
+
+static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
+{
+ if (!sched_proxy_exec())
+ return;
+ /*
+ * pick_next_task() calls set_next_task() on the chosen task
+ * at some point, which ensures it is not push/pullable.
+ * However, the chosen/donor task *and* the mutex owner form an
+ * atomic pair wrt push/pull.
+ *
+ * Make sure owner we run is not pushable. Unfortunately we can
+ * only deal with that by means of a dequeue/enqueue cycle. :-/
+ */
+ dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
+ enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+}
+
/*
* __schedule() is the main scheduler function.
*
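
The find_proxy_task() path above is only reached when task_is_blocked() is true for the picked task; that predicate is not part of this hunk. A rough sketch of its assumed shape, inferred from how p->blocked_on is used here (the real helper lives in a scheduler header and may differ):

/* Illustrative sketch only, not part of this patch. */
static inline bool task_is_blocked(struct task_struct *p)
{
	if (!sched_proxy_exec())
		return false;

	return !!p->blocked_on;
}

With proxy-exec disabled this is always false, which is why the WARN_ONCE() stub for the !CONFIG_SCHED_PROXY_EXEC case should never actually run.
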
@@ -6731,15 +6880,31 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- try_to_block_task(rq, prev, &prev_state);
+ /*
+ * We pass task_is_blocked() as the should_block arg
+ * in order to keep mutex-blocked tasks on the runqueue
+ * for selection with proxy-exec (without proxy-exec
+ * task_is_blocked() will always be false).
+ */
+ try_to_block_task(rq, prev, &prev_state,
+ !task_is_blocked(prev));
switch_count = &prev->nvcsw;
}
- next = pick_next_task(rq, prev, &rf);
+pick_again:
+ next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ if (unlikely(task_is_blocked(next))) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next)
+ goto pick_again;
+ if (next == rq->idle)
+ goto keep_resched;
+ }
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+keep_resched:
rq->last_seen_need_resched_ns = 0;
is_switch = prev != next;
@@ -6750,6 +6915,10 @@ picked:
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
+
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6784,6 +6953,10 @@ picked:
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ /* In case next was already curr but just got blocked_donor */
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
@@ -6992,14 +7165,14 @@ NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_dynamic_enabled
-#define preempt_schedule_dynamic_enabled preempt_schedule
-#define preempt_schedule_dynamic_disabled NULL
-#endif
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# ifndef preempt_schedule_dynamic_enabled
+# define preempt_schedule_dynamic_enabled preempt_schedule
+# define preempt_schedule_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
@@ -7009,8 +7182,8 @@ void __sched notrace dynamic_preempt_schedule(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -7065,14 +7238,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_notrace_dynamic_enabled
-#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
-#define preempt_schedule_notrace_dynamic_disabled NULL
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# ifndef preempt_schedule_notrace_dynamic_enabled
+# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+# define preempt_schedule_notrace_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
@@ -7082,7 +7255,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
-#endif
+# endif
#endif
#endif /* CONFIG_PREEMPTION */
@@ -7301,7 +7474,7 @@ out_unlock:
preempt_enable();
}
-#endif
+#endif /* CONFIG_RT_MUTEXES */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
@@ -7332,17 +7505,17 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define cond_resched_dynamic_enabled __cond_resched
-#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# define cond_resched_dynamic_enabled __cond_resched
+# define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
-#define might_resched_dynamic_enabled __cond_resched
-#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+# define might_resched_dynamic_enabled __cond_resched
+# define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
@@ -7360,8 +7533,8 @@ int __sched dynamic_might_resched(void)
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -7427,9 +7600,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#ifdef CONFIG_GENERIC_ENTRY
-#include <linux/entry-common.h>
-#endif
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
+# endif
/*
* SC:cond_resched
@@ -7484,37 +7657,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-#ifndef CONFIG_PREEMPT_RT
+# ifndef CONFIG_PREEMPT_RT
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
-#endif
+# endif
if (!strcmp(str, "full"))
return preempt_dynamic_full;
-#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
if (!strcmp(str, "lazy"))
return preempt_dynamic_lazy;
-#endif
+# endif
return -EINVAL;
}
-#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
-#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
-#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
-#else
-#error "Unsupported PREEMPT_DYNAMIC mechanism"
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
+# else
+# error "Unsupported PREEMPT_DYNAMIC mechanism"
+# endif
static DEFINE_MUTEX(sched_dynamic_mutex);
@@ -7618,7 +7791,7 @@ static void __init preempt_dynamic_init(void)
}
}
-#define PREEMPT_MODEL_ACCESSOR(mode) \
+# define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
@@ -7663,7 +7836,7 @@ const char *preempt_model_str(void)
if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
seq_buf_printf(&s, "(%s)%s",
- preempt_dynamic_mode > 0 ?
+ preempt_dynamic_mode >= 0 ?
preempt_modes[preempt_dynamic_mode] : "undef",
brace ? "}" : "");
return seq_buf_str(&s);
@@ -7819,12 +7992,10 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
-#ifdef CONFIG_SMP
struct affinity_context ac = (struct affinity_context) {
.new_mask = cpumask_of(cpu),
.flags = 0,
};
-#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -7840,13 +8011,11 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
-#ifdef CONFIG_SMP
/*
* No validation and serialization required at boot time and for
* setting up the idle tasks of not yet online CPUs.
*/
set_cpus_allowed_common(idle, &ac);
-#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the CPU isn't yet set to this CPU so the
@@ -7865,9 +8034,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
-#ifdef CONFIG_SMP
idle->on_cpu = 1;
-#endif
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
@@ -7880,13 +8047,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
}
-#ifdef CONFIG_SMP
-
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -7934,9 +8097,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- __schedstat_inc(p->stats.numa_task_migrated);
- count_vm_numa_event(NUMA_TASK_MIGRATE);
- count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
+ /* TODO: This is not properly updating schedstats */
+
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
@@ -8123,7 +8285,7 @@ static void balance_hotplug_wait(void)
TASK_UNINTERRUPTIBLE);
}
-#else
+#else /* !CONFIG_HOTPLUG_CPU: */
static inline void balance_push(struct rq *rq)
{
@@ -8137,7 +8299,7 @@ static inline void balance_hotplug_wait(void)
{
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* !CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
{
@@ -8446,7 +8608,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_core_cpu_dying(cpu);
return 0;
}
-#endif
+#endif /* CONFIG_HOTPLUG_CPU */
void __init sched_init_smp(void)
{
@@ -8470,6 +8632,8 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+ sched_init_dl_servers();
+
sched_smp_initialized = true;
}
@@ -8480,13 +8644,6 @@ static int __init migration_init(void)
}
early_initcall(migration_init);
-#else
-void __init sched_init_smp(void)
-{
- sched_init_granularity();
-}
-#endif /* CONFIG_SMP */
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -8512,9 +8669,7 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
-#ifdef CONFIG_SMP
BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
-#endif
BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
@@ -8545,7 +8700,7 @@ void __init sched_init(void)
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
- root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+ scx_tg_init(&root_task_group);
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8557,9 +8712,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_SMP
init_defrootdomain();
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -8620,7 +8773,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8646,7 +8798,6 @@ void __init sched_init(void)
#ifdef CONFIG_HOTPLUG_CPU
rcuwait_init(&rq->hotplug_wait);
#endif
-#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
@@ -8694,10 +8845,9 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
-#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+
balance_push_set(smp_processor_id(), false);
-#endif
init_sched_fair_class();
init_sched_ext_class();
@@ -8830,7 +8980,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
}
EXPORT_SYMBOL_GPL(__cant_sleep);
-#ifdef CONFIG_SMP
+# ifdef CONFIG_SMP
void __cant_migrate(const char *file, int line)
{
static unsigned long prev_jiffy;
@@ -8861,8 +9011,8 @@ void __cant_migrate(const char *file, int line)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_migrate);
-#endif
-#endif
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
@@ -8902,7 +9052,7 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_KGDB_KDB)
+#ifdef CONFIG_KGDB_KDB
/*
* These functions are only useful for KDB.
*
@@ -8926,7 +9076,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_KGDB_KDB) */
+#endif /* CONFIG_KGDB_KDB */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -8985,7 +9135,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
- scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ scx_tg_init(tg);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -9422,47 +9572,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-/* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
-
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
- u64 burst)
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 period, quota, burst;
- if (tg == &root_task_group)
- return -EINVAL;
-
- /*
- * Ensure we have at some amount of bandwidth every period. This is
- * to prevent reaching a state of large arrears when throttled via
- * entity_tick() resulting in prolonged exit starvation.
- */
- if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
- return -EINVAL;
-
- /*
- * Likewise, bound things on the other side by preventing insane quota
- * periods. This also allows us to normalize in computing quota
- * feasibility.
- */
- if (period > max_cfs_quota_period)
- return -EINVAL;
+ period = (u64)period_us * NSEC_PER_USEC;
- /*
- * Bound quota to defend quota against overflow during bandwidth shift.
- */
- if (quota != RUNTIME_INF && quota > max_cfs_runtime)
- return -EINVAL;
+ if (quota_us == RUNTIME_INF)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)quota_us * NSEC_PER_USEC;
- if (quota != RUNTIME_INF && (burst > quota ||
- burst + quota > max_cfs_runtime))
- return -EINVAL;
+ burst = (u64)burst_us * NSEC_PER_USEC;
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
@@ -9517,28 +9643,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
return 0;
}
-static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static u64 tg_get_cfs_period(struct task_group *tg)
{
- u64 quota, period, burst;
+ u64 cfs_period_us;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- burst = tg->cfs_bandwidth.burst;
- if (cfs_quota_us < 0)
- quota = RUNTIME_INF;
- else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
- quota = (u64)cfs_quota_us * NSEC_PER_USEC;
- else
- return -EINVAL;
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
+ return cfs_period_us;
}
-static long tg_get_cfs_quota(struct task_group *tg)
+static u64 tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
- return -1;
+ return RUNTIME_INF;
quota_us = tg->cfs_bandwidth.quota;
do_div(quota_us, NSEC_PER_USEC);
@@ -9546,45 +9666,7 @@ static long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- period = (u64)cfs_period_us * NSEC_PER_USEC;
- quota = tg->cfs_bandwidth.quota;
- burst = tg->cfs_bandwidth.burst;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_period(struct task_group *tg)
-{
- u64 cfs_period_us;
-
- cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
- do_div(cfs_period_us, NSEC_PER_USEC);
-
- return cfs_period_us;
-}
-
-static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- burst = (u64)cfs_burst_us * NSEC_PER_USEC;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- quota = tg->cfs_bandwidth.quota;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_burst(struct task_group *tg)
+static u64 tg_get_cfs_burst(struct task_group *tg)
{
u64 burst_us;
@@ -9594,42 +9676,6 @@ static long tg_get_cfs_burst(struct task_group *tg)
return burst_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_quota(css_tg(css));
-}
-
-static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
- struct cftype *cftype, s64 cfs_quota_us)
-{
- return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
-}
-
-static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_period(css_tg(css));
-}
-
-static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_period_us)
-{
- return tg_set_cfs_period(css_tg(css), cfs_period_us);
-}
-
-static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_burst(css_tg(css));
-}
-
-static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_burst_us)
-{
- return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
-}
-
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -9762,6 +9808,126 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
+static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_bw_runtime_us = MAX_BW;
+
+static void tg_bandwidth(struct task_group *tg,
+ u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
+{
+ if (period_us_p)
+ *period_us_p = tg_get_cfs_period(tg);
+ if (quota_us_p)
+ *quota_us_p = tg_get_cfs_quota(tg);
+ if (burst_us_p)
+ *burst_us_p = tg_get_cfs_burst(tg);
+}
+
+static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 period_us;
+
+ tg_bandwidth(css_tg(css), &period_us, NULL, NULL);
+ return period_us;
+}
+
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /* Values should survive translation to nsec */
+ if (period_us > max_usec ||
+ (quota_us != RUNTIME_INF && quota_us > max_usec) ||
+ burst_us > max_usec)
+ return -EINVAL;
+
+ /*
+ * Ensure we have some amount of bandwidth every period. This is to
+ * prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota_us < min_bw_quota_period_us ||
+ period_us < min_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the other side by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period_us > max_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
+ return -EINVAL;
+
+ if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
+ burst_us + quota_us > max_bw_runtime_us))
+ return -EINVAL;
+
+ return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 quota_us;
+
+ tg_bandwidth(css_tg(css), NULL, &quota_us, NULL);
+ return quota_us; /* (s64)RUNTIME_INF becomes -1 */
+}
+
+static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 burst_us;
+
+ tg_bandwidth(css_tg(css), NULL, NULL, &burst_us);
+ return burst_us;
+}
+
+static int cpu_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 period_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 quota_us, burst_us;
+
+ tg_bandwidth(tg, NULL, &quota_us, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 quota_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, burst_us;
+
+ if (quota_us < 0)
+ quota_us = RUNTIME_INF;
+
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 burst_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, quota_us;
+
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
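
The comment in cpu_quota_read_s64() above depends on RUNTIME_INF being the all-ones u64, so the cast to s64 reproduces the legacy -1 "unlimited" value that cpu.cfs_quota_us has always reported. A tiny stand-alone demonstration of that round-trip (user-space C, illustrative only; RUNTIME_INF is assumed to match the kernel's definition):

#include <stdio.h>
#include <stdint.h>

#define RUNTIME_INF ((uint64_t)~0ULL)	/* assumed kernel value */

int main(void)
{
	/* What cpu_quota_read_s64() hands back for an unlimited quota. */
	int64_t legacy_quota_us = (int64_t)RUNTIME_INF;

	printf("%lld\n", (long long)legacy_quota_us);	/* prints -1 */
	return 0;
}
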
@@ -9807,7 +9973,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
scx_group_set_idle(css_tg(css), idle);
return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -9824,19 +9990,19 @@ static struct cftype cpu_legacy_files[] = {
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
- .name = "cfs_quota_us",
- .read_s64 = cpu_cfs_quota_read_s64,
- .write_s64 = cpu_cfs_quota_write_s64,
+ .name = "cfs_period_us",
+ .read_u64 = cpu_period_read_u64,
+ .write_u64 = cpu_period_write_u64,
},
{
- .name = "cfs_period_us",
- .read_u64 = cpu_cfs_period_read_u64,
- .write_u64 = cpu_cfs_period_write_u64,
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_quota_read_s64,
+ .write_s64 = cpu_quota_write_s64,
},
{
.name = "cfs_burst_us",
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
{
.name = "stat",
@@ -9935,7 +10101,7 @@ static int cpu_extra_stat_show(struct seq_file *sf,
cfs_b->nr_periods, cfs_b->nr_throttled,
throttled_usec, cfs_b->nr_burst, burst_usec);
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
return 0;
}
@@ -10033,22 +10199,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
}
/* caller should put the current value in *@periodp before calling */
-static int __maybe_unused cpu_period_quota_parse(char *buf,
- u64 *periodp, u64 *quotap)
+static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
+ u64 *quota_us_p)
{
char tok[21]; /* U64_MAX */
- if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+ if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1)
return -EINVAL;
- *periodp *= NSEC_PER_USEC;
-
- if (sscanf(tok, "%llu", quotap))
- *quotap *= NSEC_PER_USEC;
- else if (!strcmp(tok, "max"))
- *quotap = RUNTIME_INF;
- else
- return -EINVAL;
+ if (sscanf(tok, "%llu", quota_us_p) < 1) {
+ if (!strcmp(tok, "max"))
+ *quota_us_p = RUNTIME_INF;
+ else
+ return -EINVAL;
+ }
return 0;
}
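
As a worked example of the parser above: cgroup-v2 cpu.max accepts "<quota_us> <period_us>" or "max [<period_us>]", with the caller pre-loading *period_us_p so a bare "max" keeps the current period. A self-contained user-space sketch that mirrors the same sscanf logic (illustrative only, not part of the patch):

#include <stdio.h>
#include <string.h>

#define RUNTIME_INF (~0ULL)	/* assumed kernel value */

/*
 * "50000 100000" -> quota = 50000, period = 100000 (microseconds)
 * "max 100000"   -> quota = RUNTIME_INF, period = 100000
 * "max"          -> quota = RUNTIME_INF, period left at the caller's value
 */
static int parse_cpu_max(const char *buf, unsigned long long *period_us,
			 unsigned long long *quota_us)
{
	char tok[21];	/* room for U64_MAX plus the terminating NUL */

	if (sscanf(buf, "%20s %llu", tok, period_us) < 1)
		return -1;

	if (sscanf(tok, "%llu", quota_us) < 1) {
		if (!strcmp(tok, "max"))
			*quota_us = RUNTIME_INF;
		else
			return -1;
	}
	return 0;
}

int main(void)
{
	unsigned long long period_us = 100000, quota_us = 0;

	if (!parse_cpu_max("max", &period_us, &quota_us))
		printf("quota=%llu period=%llu\n", quota_us, period_us);
	return 0;
}
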
@@ -10057,8 +10221,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
+ u64 period_us, quota_us;
- cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ cpu_period_quota_print(sf, period_us, quota_us);
return 0;
}
@@ -10066,17 +10232,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct task_group *tg = css_tg(of_css(of));
- u64 period = tg_get_cfs_period(tg);
- u64 burst = tg->cfs_bandwidth.burst;
- u64 quota;
+ u64 period_us, quota_us, burst_us;
int ret;
- ret = cpu_period_quota_parse(buf, &period, &quota);
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ ret = cpu_period_quota_parse(buf, &period_us, &quota_us);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
+ ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us);
return ret ?: nbytes;
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
static struct cftype cpu_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -10109,10 +10274,10 @@ static struct cftype cpu_files[] = {
{
.name = "max.burst",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -10126,7 +10291,7 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
-#endif
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
{ } /* terminate */
};
@@ -10147,7 +10312,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.threaded = true,
};
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
void dump_cpu_task(int cpu)
{
@@ -10733,7 +10898,7 @@ void sched_mm_cid_fork(struct task_struct *t)
WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
t->mm_cid_active = 1;
}
-#endif
+#endif /* CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
@@ -10768,4 +10933,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
if (ctx->running)
set_next_task(rq, ctx->p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* CONFIG_SCHED_CLASS_EXT */