Merge tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar: "Core scheduler changes: - Better tracking of maximum lag of tasks in presence of different slices duration, for better handling of lag in the fair scheduler (Vincent Guittot) - Clean up and standardize #if/#else/#endif markers throughout the entire scheduler code base (Ingo Molnar) - Make SMP unconditional: build the SMP scheduler's data structures and logic on UP kernel too, even though they are not used, to simplify the scheduler and remove around 200 #ifdef/[#else]/#endif blocks from the scheduler (Ingo Molnar) - Reorganize cgroup bandwidth control interface handling for better interfacing with sched_ext (Tejun Heo) Balancing: - Bump sd->max_newidle_lb_cost when newidle balance fails (Chris Mason) - Remove sched_domain_topology_level::flags to simplify the code (Prateek Nayak) - Simplify and clean up build_sched_topology() (Li Chen) - Optimize build_sched_topology() on large machines (Li Chen) Real-time scheduling: - Add initial version of proxy execution: a mechanism for mutex-owning tasks to inherit the scheduling context of higher priority waiters. Currently limited to a single runqueue and conditional on CONFIG_EXPERT, and other limitations (John Stultz, Peter Zijlstra, Valentin Schneider) - Deadline scheduler (Juri Lelli): - Fix dl_servers initialization order (Juri Lelli) - Fix DL scheduler's root domain reinitialization logic (Juri Lelli) - Fix accounting bugs after global limits change (Juri Lelli) - Fix scalability regression by implementing less agressive dl_server handling (Peter Zijlstra) PSI: - Improve scalability by optimizing psi_group_change() cpu_clock() usage (Peter Zijlstra) Rust changes: - Make Task, CondVar and PollCondVar methods inline to avoid unnecessary function calls (Kunwu Chan, Panagiotis Foliadis) - Add might_sleep() support for Rust code: Rust's "#[track_caller]" mechanism is used so that Rust's might_sleep() doesn't need to be defined as a macro (Fujita Tomonori) - Introduce file_from_location() (Boqun Feng) Debugging & instrumentation: - Make clangd usable with scheduler source code files again (Peter Zijlstra) - tools: Add root_domains_dump.py which dumps root domains info (Juri Lelli) - tools: Add dl_bw_dump.py for printing bandwidth accounting info (Juri Lelli) Misc cleanups & fixes: - Remove play_idle() (Feng Lee) - Fix check_preemption_disabled() (Sebastian Andrzej Siewior) - Do not call __put_task_struct() on RT if pi_blocked_on is set (Luis Claudio R. Goncalves) - Correct the comment in place_entity() (wang wei)" * tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (84 commits) sched/idle: Remove play_idle() sched: Do not call __put_task_struct() on rt if pi_blocked_on is set sched: Start blocked_on chain processing in find_proxy_task() sched: Fix proxy/current (push,pull)ability sched: Add an initial sketch of the find_proxy_task() function sched: Fix runtime accounting w/ split exec & sched contexts sched: Move update_curr_task logic into update_curr_se locking/mutex: Add p->blocked_on wrappers for correctness checks locking/mutex: Rework task_struct::blocked_on sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable sched/topology: Remove sched_domain_topology_level::flags x86/smpboot: avoid SMT domain attach/destroy if SMT is not enabled x86/smpboot: moves x86_topology to static initialize and truncate x86/smpboot: remove redundant CONFIG_SCHED_SMT smpboot: introduce SDTL_INIT() helper to tidy sched topology setup tools/sched: Add dl_bw_dump.py for printing bandwidth accounting info tools/sched: Add root_domains_dump.py which dumps root domains info sched/deadline: Fix accounting after global limits change sched/deadline: Reset extra_bw to max_bw when clearing root domains sched/deadline: Initialize dl_servers after SMP ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2025-07-29 17:42:52 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2025-07-29 17:42:52 -0700
commit: bf76f23aa1c178e9115eba17f699fa726aed669b (patch)
tree: a88270238eae8a0e6d98225c7de4b06bc2d2fb37 /include/linux/sched.h
parent: 14bed9bc81bae64db98349319f367bfc7dab0afd (diff)
parent: 1b5f1454091e9e9fb5c944b3161acf4ec0894d0d (diff)
1 files changed, 86 insertions, 62 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63a9b6e23b07..40d2fa90df42 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/types.h>
 #include <linux/signal_types.h>
+#include <linux/spinlock.h>
 #include <linux/syscall_user_dispatch_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/netdevice_xmit.h>
@@ -395,15 +396,10 @@ enum uclamp_id {
 	UCLAMP_CNT
 };
 
-#ifdef CONFIG_SMP
 extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 extern void sched_domains_mutex_lock(void);
 extern void sched_domains_mutex_unlock(void);
-#else
-static inline void sched_domains_mutex_lock(void) { }
-static inline void sched_domains_mutex_unlock(void) { }
-#endif
 
 struct sched_param {
 	int sched_priority;
@@ -584,7 +580,15 @@ struct sched_entity {
 	u64				sum_exec_runtime;
 	u64				prev_sum_exec_runtime;
 	u64				vruntime;
-	s64				vlag;
+	union {
+		/*
+		 * When !@on_rq this field is vlag.
+		 * When cfs_rq->curr == se (which implies @on_rq)
+		 * this field is vprot. See protect_slice().
+		 */
+		s64                     vlag;
+		u64                     vprot;
+	};
 	u64				slice;
 
 	u64				nr_migrations;
@@ -600,7 +604,6 @@ struct sched_entity {
 	unsigned long			runnable_weight;
 #endif
 
-#ifdef CONFIG_SMP
 	/*
 	 * Per entity load average tracking.
 	 *
@@ -608,7 +611,6 @@ struct sched_entity {
 	 * collide with read-mostly values above.
 	 */
 	struct sched_avg		avg;
-#endif
 };
 
 struct sched_rt_entity {
@@ -701,6 +703,7 @@ struct sched_dl_entity {
 	unsigned int			dl_defer	  : 1;
 	unsigned int			dl_defer_armed	  : 1;
 	unsigned int			dl_defer_running  : 1;
+	unsigned int			dl_server_idle    : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -838,7 +841,6 @@ struct task_struct {
 	struct alloc_tag		*alloc_tag;
 #endif
 
-#ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
 	unsigned int			wakee_flips;
@@ -854,7 +856,6 @@ struct task_struct {
 	 */
 	int				recent_used_cpu;
 	int				wake_cpu;
-#endif
 	int				on_rq;
 
 	int				prio;
@@ -913,9 +914,7 @@ struct task_struct {
 	cpumask_t			*user_cpus_ptr;
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
-#ifdef CONFIG_SMP
 	unsigned short			migration_disabled;
-#endif
 	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
@@ -947,10 +946,8 @@ struct task_struct {
 	struct sched_info		sched_info;
 
 	struct list_head		tasks;
-#ifdef CONFIG_SMP
 	struct plist_node		pushable_tasks;
 	struct rb_node			pushable_dl_tasks;
-#endif
 
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
@@ -1234,10 +1231,7 @@ struct task_struct {
 	struct rt_mutex_waiter		*pi_blocked_on;
 #endif
 
-#ifdef CONFIG_DEBUG_MUTEXES
-	/* Mutex deadlock detection: */
-	struct mutex_waiter		*blocked_on;
-#endif
+	struct mutex			*blocked_on;	/* lock we're blocked on */
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
@@ -1662,6 +1656,19 @@ struct task_struct {
 	randomized_struct_fields_end
 } __attribute__ ((aligned (64)));
 
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static inline bool sched_proxy_exec(void)
+{
+	return static_branch_likely(&__sched_proxy_exec);
+}
+#else
+static inline bool sched_proxy_exec(void)
+{
+	return false;
+}
+#endif
+
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
@@ -1776,12 +1783,8 @@ extern struct pid *cad_pid;
 
 static __always_inline bool is_percpu_thread(void)
 {
-#ifdef CONFIG_SMP
 	return (current->flags & PF_NO_SETAFFINITY) &&
 		(current->nr_cpus_allowed  == 1);
-#else
-	return true;
-#endif
 }
 
 /* Per-process atomic flags. */
@@ -1846,7 +1849,6 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
 extern int task_can_attach(struct task_struct *p);
 extern int dl_bw_alloc(int cpu, u64 dl_bw);
 extern void dl_bw_free(int cpu, u64 dl_bw);
-#ifdef CONFIG_SMP
 
 /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
@@ -1864,33 +1866,6 @@ extern void release_user_cpus_ptr(struct task_struct *p);
 extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
 extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
 extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
-#else
-static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-}
-static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	/* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */
-	if ((*cpumask_bits(new_mask) & 1) == 0)
-		return -EINVAL;
-	return 0;
-}
-static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
-{
-	if (src->user_cpus_ptr)
-		return -EINVAL;
-	return 0;
-}
-static inline void release_user_cpus_ptr(struct task_struct *p)
-{
-	WARN_ON(p->user_cpus_ptr);
-}
-
-static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
-{
-	return 0;
-}
-#endif
 
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
@@ -1979,11 +1954,7 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
 extern void wake_up_new_task(struct task_struct *tsk);
 
-#ifdef CONFIG_SMP
 extern void kick_process(struct task_struct *tsk);
-#else
-static inline void kick_process(struct task_struct *tsk) { }
-#endif
 
 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
 #define set_task_comm(tsk, from) ({			\
@@ -2010,7 +1981,6 @@ extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec
 	buf;						\
 })
 
-#ifdef CONFIG_SMP
 static __always_inline void scheduler_ipi(void)
 {
 	/*
@@ -2020,9 +1990,6 @@ static __always_inline void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 }
-#else
-static inline void scheduler_ipi(void) { }
-#endif
 
 extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
 
@@ -2165,6 +2132,67 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_rwlock_write(lock);					\
 })
 
+#ifndef CONFIG_PREEMPT_RT
+static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
+{
+	struct mutex *m = p->blocked_on;
+
+	if (m)
+		lockdep_assert_held_once(&m->wait_lock);
+	return m;
+}
+
+static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	WARN_ON_ONCE(!m);
+	/* The task should only be setting itself as blocked */
+	WARN_ON_ONCE(p != current);
+	/* Currently we serialize blocked_on under the mutex::wait_lock */
+	lockdep_assert_held_once(&m->wait_lock);
+	/*
+	 * Check ensure we don't overwrite existing mutex value
+	 * with a different mutex. Note, setting it to the same
+	 * lock repeatedly is ok.
+	 */
+	WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
+	p->blocked_on = m;
+}
+
+static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	guard(raw_spinlock_irqsave)(&m->wait_lock);
+	__set_task_blocked_on(p, m);
+}
+
+static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	WARN_ON_ONCE(!m);
+	/* Currently we serialize blocked_on under the mutex::wait_lock */
+	lockdep_assert_held_once(&m->wait_lock);
+	/*
+	 * There may be cases where we re-clear already cleared
+	 * blocked_on relationships, but make sure we are not
+	 * clearing the relationship with a different lock.
+	 */
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+	p->blocked_on = NULL;
+}
+
+static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	guard(raw_spinlock_irqsave)(&m->wait_lock);
+	__clear_task_blocked_on(p, m);
+}
+#else
+static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
+static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+#endif /* !CONFIG_PREEMPT_RT */
+
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
@@ -2204,8 +2232,6 @@ extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
 
-#include <linux/spinlock.h>
-
 /*
  * In order to reduce various lock holder preemption latencies provide an
  * interface to see if a vCPU is currently running or not.
@@ -2228,7 +2254,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
-#ifdef CONFIG_SMP
 static inline bool owner_on_cpu(struct task_struct *owner)
 {
 	/*
@@ -2240,7 +2265,6 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 
 /* Returns effective CPU energy utilization, as seen by the scheduler */
 unsigned long sched_cpu_util(int cpu);
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
author	Linus Torvalds <torvalds@linux-foundation.org>	2025-07-29 17:42:52 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-07-29 17:42:52 -0700
commit	bf76f23aa1c178e9115eba17f699fa726aed669b (patch)
tree	a88270238eae8a0e6d98225c7de4b06bc2d2fb37 /include/linux/sched.h
parent	14bed9bc81bae64db98349319f367bfc7dab0afd (diff)
parent	1b5f1454091e9e9fb5c944b3161acf4ec0894d0d (diff)