diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-07-29 17:42:52 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-07-29 17:42:52 -0700 |
commit | bf76f23aa1c178e9115eba17f699fa726aed669b (patch) | |
tree | a88270238eae8a0e6d98225c7de4b06bc2d2fb37 /include/linux/sched.h | |
parent | 14bed9bc81bae64db98349319f367bfc7dab0afd (diff) | |
parent | 1b5f1454091e9e9fb5c944b3161acf4ec0894d0d (diff) |
Merge tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Core scheduler changes:
- Better tracking of maximum lag of tasks in presence of different
slices duration, for better handling of lag in the fair scheduler
(Vincent Guittot)
- Clean up and standardize #if/#else/#endif markers throughout the
entire scheduler code base (Ingo Molnar)
- Make SMP unconditional: build the SMP scheduler's data structures
and logic on UP kernel too, even though they are not used, to
simplify the scheduler and remove around 200 #ifdef/[#else]/#endif
blocks from the scheduler (Ingo Molnar)
- Reorganize cgroup bandwidth control interface handling for better
interfacing with sched_ext (Tejun Heo)
Balancing:
- Bump sd->max_newidle_lb_cost when newidle balance fails (Chris
Mason)
- Remove sched_domain_topology_level::flags to simplify the code
(Prateek Nayak)
- Simplify and clean up build_sched_topology() (Li Chen)
- Optimize build_sched_topology() on large machines (Li Chen)
Real-time scheduling:
- Add initial version of proxy execution: a mechanism for
mutex-owning tasks to inherit the scheduling context of higher
priority waiters.
Currently limited to a single runqueue and conditional on
CONFIG_EXPERT, and other limitations (John Stultz, Peter Zijlstra,
Valentin Schneider)
- Deadline scheduler (Juri Lelli):
- Fix dl_servers initialization order (Juri Lelli)
- Fix DL scheduler's root domain reinitialization logic (Juri
Lelli)
- Fix accounting bugs after global limits change (Juri Lelli)
- Fix scalability regression by implementing less agressive
dl_server handling (Peter Zijlstra)
PSI:
- Improve scalability by optimizing psi_group_change() cpu_clock()
usage (Peter Zijlstra)
Rust changes:
- Make Task, CondVar and PollCondVar methods inline to avoid
unnecessary function calls (Kunwu Chan, Panagiotis Foliadis)
- Add might_sleep() support for Rust code: Rust's "#[track_caller]"
mechanism is used so that Rust's might_sleep() doesn't need to be
defined as a macro (Fujita Tomonori)
- Introduce file_from_location() (Boqun Feng)
Debugging & instrumentation:
- Make clangd usable with scheduler source code files again (Peter
Zijlstra)
- tools: Add root_domains_dump.py which dumps root domains info (Juri
Lelli)
- tools: Add dl_bw_dump.py for printing bandwidth accounting info
(Juri Lelli)
Misc cleanups & fixes:
- Remove play_idle() (Feng Lee)
- Fix check_preemption_disabled() (Sebastian Andrzej Siewior)
- Do not call __put_task_struct() on RT if pi_blocked_on is set (Luis
Claudio R. Goncalves)
- Correct the comment in place_entity() (wang wei)"
* tag 'sched-core-2025-07-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (84 commits)
sched/idle: Remove play_idle()
sched: Do not call __put_task_struct() on rt if pi_blocked_on is set
sched: Start blocked_on chain processing in find_proxy_task()
sched: Fix proxy/current (push,pull)ability
sched: Add an initial sketch of the find_proxy_task() function
sched: Fix runtime accounting w/ split exec & sched contexts
sched: Move update_curr_task logic into update_curr_se
locking/mutex: Add p->blocked_on wrappers for correctness checks
locking/mutex: Rework task_struct::blocked_on
sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable
sched/topology: Remove sched_domain_topology_level::flags
x86/smpboot: avoid SMT domain attach/destroy if SMT is not enabled
x86/smpboot: moves x86_topology to static initialize and truncate
x86/smpboot: remove redundant CONFIG_SCHED_SMT
smpboot: introduce SDTL_INIT() helper to tidy sched topology setup
tools/sched: Add dl_bw_dump.py for printing bandwidth accounting info
tools/sched: Add root_domains_dump.py which dumps root domains info
sched/deadline: Fix accounting after global limits change
sched/deadline: Reset extra_bw to max_bw when clearing root domains
sched/deadline: Initialize dl_servers after SMP
...
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r-- | include/linux/sched.h | 148 |
1 files changed, 86 insertions, 62 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 63a9b6e23b07..40d2fa90df42 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> +#include <linux/spinlock.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> #include <linux/netdevice_xmit.h> @@ -395,15 +396,10 @@ enum uclamp_id { UCLAMP_CNT }; -#ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); -#else -static inline void sched_domains_mutex_lock(void) { } -static inline void sched_domains_mutex_unlock(void) { } -#endif struct sched_param { int sched_priority; @@ -584,7 +580,15 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; - s64 vlag; + union { + /* + * When !@on_rq this field is vlag. + * When cfs_rq->curr == se (which implies @on_rq) + * this field is vprot. See protect_slice(). + */ + s64 vlag; + u64 vprot; + }; u64 slice; u64 nr_migrations; @@ -600,7 +604,6 @@ struct sched_entity { unsigned long runnable_weight; #endif -#ifdef CONFIG_SMP /* * Per entity load average tracking. * @@ -608,7 +611,6 @@ struct sched_entity { * collide with read-mostly values above. */ struct sched_avg avg; -#endif }; struct sched_rt_entity { @@ -701,6 +703,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -838,7 +841,6 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif -#ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; @@ -854,7 +856,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; -#endif int on_rq; int prio; @@ -913,9 +914,7 @@ struct task_struct { cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#ifdef CONFIG_SMP unsigned short migration_disabled; -#endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU @@ -947,10 +946,8 @@ struct task_struct { struct sched_info sched_info; struct list_head tasks; -#ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; -#endif struct mm_struct *mm; struct mm_struct *active_mm; @@ -1234,10 +1231,7 @@ struct task_struct { struct rt_mutex_waiter *pi_blocked_on; #endif -#ifdef CONFIG_DEBUG_MUTEXES - /* Mutex deadlock detection: */ - struct mutex_waiter *blocked_on; -#endif + struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* @@ -1662,6 +1656,19 @@ struct task_struct { randomized_struct_fields_end } __attribute__ ((aligned (64))); +#ifdef CONFIG_SCHED_PROXY_EXEC +DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); +static inline bool sched_proxy_exec(void) +{ + return static_branch_likely(&__sched_proxy_exec); +} +#else +static inline bool sched_proxy_exec(void) +{ + return false; +} +#endif + #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) @@ -1776,12 +1783,8 @@ extern struct pid *cad_pid; static __always_inline bool is_percpu_thread(void) { -#ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); -#else - return true; -#endif } /* Per-process atomic flags. */ @@ -1846,7 +1849,6 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -#ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -1864,33 +1866,6 @@ extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ - if ((*cpumask_bits(new_mask) & 1) == 0) - return -EINVAL; - return 0; -} -static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) -{ - if (src->user_cpus_ptr) - return -EINVAL; - return 0; -} -static inline void release_user_cpus_ptr(struct task_struct *p) -{ - WARN_ON(p->user_cpus_ptr); -} - -static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - return 0; -} -#endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); @@ -1979,11 +1954,7 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); -#else -static inline void kick_process(struct task_struct *tsk) { } -#endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ @@ -2010,7 +1981,6 @@ extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec buf; \ }) -#ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* @@ -2020,9 +1990,6 @@ static __always_inline void scheduler_ipi(void) */ preempt_fold_need_resched(); } -#else -static inline void scheduler_ipi(void) { } -#endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); @@ -2165,6 +2132,67 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) +#ifndef CONFIG_PREEMPT_RT +static inline struct mutex *__get_task_blocked_on(struct task_struct *p) +{ + struct mutex *m = p->blocked_on; + + if (m) + lockdep_assert_held_once(&m->wait_lock); + return m; +} + +static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + WARN_ON_ONCE(!m); + /* The task should only be setting itself as blocked */ + WARN_ON_ONCE(p != current); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * Check ensure we don't overwrite existing mutex value + * with a different mutex. Note, setting it to the same + * lock repeatedly is ok. + */ + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); + p->blocked_on = m; +} + +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __set_task_blocked_on(p, m); +} + +static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + WARN_ON_ONCE(!m); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + p->blocked_on = NULL; +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __clear_task_blocked_on(p, m); +} +#else +static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} +#endif /* !CONFIG_PREEMPT_RT */ + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); @@ -2204,8 +2232,6 @@ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); -#include <linux/spinlock.h> - /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. @@ -2228,7 +2254,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* @@ -2240,7 +2265,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); -#endif /* CONFIG_SMP */ #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); |