diff options
Diffstat (limited to 'kernel/sched/sched.h')
-rw-r--r-- | kernel/sched/sched.h | 1190 |
1 files changed, 847 insertions, 343 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae50f212775e..475bb5998295 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -68,12 +68,19 @@ #include <linux/wait_api.h> #include <linux/wait_bit.h> #include <linux/workqueue_api.h> +#include <linux/delayacct.h> #include <trace/events/power.h> #include <trace/events/sched.h> #include "../workqueue_internal.h" +struct rq; +struct cfs_rq; +struct rt_rq; +struct sched_group; +struct cpuidle_state; + #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> @@ -84,15 +91,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif - -struct rq; -struct cpuidle_state; - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -112,14 +110,28 @@ extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; /* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + +/* * Helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ)) /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group * hierarchies, especially on larger systems. This is not a user-visible change * and does not change the user-interface for setting shares/weights. * @@ -133,12 +145,13 @@ extern int sched_rr_timeslice; #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) \ -({ \ - unsigned long __w = (w); \ - if (__w) \ - __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ - __w; \ +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ }) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) @@ -173,9 +186,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) @@ -187,6 +210,7 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } + static inline bool valid_policy(int policy) { return idle_policy(policy) || fair_policy(policy) || @@ -208,11 +232,12 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; + *avg += diff / 8; } @@ -224,6 +249,24 @@ static inline void update_avg(u64 *avg, u64 sample) (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) /* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +/* * !! For sched_setattr_nocheck() (kernel) only !! * * This is actually gross. :( @@ -237,7 +280,7 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 -#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { @@ -313,8 +356,8 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); - +extern int dl_bw_deactivate(int cpu); +extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following * interface: @@ -340,12 +383,21 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick); + dl_server_pick_f pick_task); -#ifdef CONFIG_CGROUP_SCHED +extern void dl_server_update_idle_time(struct rq *rq, + struct task_struct *p); +extern void fair_server_init(struct rq *rq); +extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); +extern int dl_server_apply_params(struct sched_dl_entity *dl_se, + u64 runtime, u64 period, bool init); -struct cfs_rq; -struct rt_rq; +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} + +#ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; @@ -379,20 +431,21 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_GROUP_SCHED_WEIGHT + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each CPU */ struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; - - /* A positive value indicates that this is a SCHED_IDLE group. */ - int idle; - #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put - * it in its own cacheline separated from the fields above which + * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; @@ -406,6 +459,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -430,7 +488,7 @@ struct task_group { }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -461,6 +519,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -503,7 +566,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -517,11 +580,15 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } +static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; + static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif /* CONFIG_CGROUP_SCHED */ @@ -537,8 +604,8 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent * applicable for 32-bits architectures. */ #ifdef CONFIG_64BIT -# define u64_u32_load_copy(var, copy) var -# define u64_u32_store_copy(var, copy, val) (var = val) +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) #else # define u64_u32_load_copy(var, copy) \ ({ \ @@ -566,31 +633,31 @@ do { \ copy = __val; \ } while (0) #endif -# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) -# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; - u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 min_vruntime_fi; #endif -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - struct rb_root_cached tasks_timeline; /* @@ -600,10 +667,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - #ifdef CONFIG_SMP /* * CFS load tracking @@ -677,6 +740,47 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + /* + * A hotplugged CPU starts scheduling before rq_online_scx(). Track + * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called + * only while the BPF scheduler considers the CPU to be online. + */ + SCX_RQ_ONLINE = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, + SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + + SCX_RQ_IN_WAKEUP = 1 << 16, + SCX_RQ_IN_BALANCE = 1 << 17, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool cpu_released; + u32 flags; + u64 clock; /* current per-rq clock -- see scx_bpf_now() */ + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct balance_callback deferred_bal_cb; + struct irq_work deferred_irq_work; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -701,23 +805,25 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ int rt_queued; +#ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; - u64 rt_time; - u64 rt_runtime; + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; -#ifdef CONFIG_RT_GROUP_SCHED unsigned int rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; /* this is always top-level rq, cache? */ +#endif +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ #endif }; @@ -745,7 +851,7 @@ struct dl_rq { u64 next; } earliest_dl; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -789,33 +895,42 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED + /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) static inline void se_update_runnable(struct sched_entity *se) { if (!entity_is_task(se)) - se->runnable_weight = se->my_q->h_nr_running; + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + if (entity_is_task(se)) return !!se->on_rq; else return se->runnable_weight; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ + #define entity_is_task(se) 1 -static inline void se_update_runnable(struct sched_entity *se) {} +static inline void se_update_runnable(struct sched_entity *se) { } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + return !!se->on_rq; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP /* @@ -838,10 +953,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -862,10 +973,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + bool overloaded; - /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + /* Indicate one or more CPUs over-utilized (tipping point) */ + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -883,7 +994,7 @@ struct root_domain { * Also, some corner cases, like 'wrap around' is dangerous, but given * that u64 is 'big enough'. So that shouldn't be a concern. */ - u64 visit_gen; + u64 visit_cookie; #ifdef HAVE_RT_PUSH_IPI /* @@ -905,8 +1016,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. @@ -920,6 +1029,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overloaded(struct root_domain *rd) +{ + return READ_ONCE(rd->overloaded); +} + +static inline void set_rd_overloaded(struct root_domain *rd, int status) +{ + if (get_rd_overloaded(rd) != status) + WRITE_ONCE(rd->overloaded, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -969,12 +1089,6 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ -struct rq; -struct balance_callback { - struct balance_callback *next; - void (*func)(struct rq *rq); -}; - /* * This is the main, per-CPU runqueue data structure. * @@ -1017,6 +1131,11 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif + + struct sched_dl_entity fair_server; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1032,7 +1151,11 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; + struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -1053,10 +1176,8 @@ struct rq { atomic_t nr_iowait; -#ifdef CONFIG_SCHED_DEBUG u64 last_seen_need_resched_ns; int ticks_without_resched; -#endif #ifdef CONFIG_MEMBARRIER int membarrier_state; @@ -1091,8 +1212,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; @@ -1107,6 +1228,7 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; @@ -1124,14 +1246,13 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; - ktime_t hrtick_time; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_count; @@ -1146,7 +1267,7 @@ struct rq { #endif #ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ + /* Must be inspected within a RCU lock section */ struct cpuidle_state *idle_state; #endif @@ -1160,6 +1281,7 @@ struct rq { /* per rq */ struct rq *core; struct task_struct *core_pick; + struct sched_dl_entity *core_dl_server; unsigned int core_enabled; unsigned int core_sched_seq; struct rb_root core_tree; @@ -1208,7 +1330,7 @@ static inline int cpu_of(struct rq *rq) #endif } -#define MDF_PUSH 0x01 +#define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { @@ -1227,7 +1349,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) -struct sched_group; +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1263,9 +1389,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, - bool fi); -void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); +extern bool +cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1333,7 +1460,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) { @@ -1371,7 +1498,25 @@ static inline bool sched_group_cookie_match(struct rq *rq, { return true; } -#endif /* CONFIG_SCHED_CORE */ + +#endif /* !CONFIG_SCHED_CORE */ +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DECLARE_STATIC_KEY_FALSE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_unlikely(&rt_group_sched); +} +# else +DECLARE_STATIC_KEY_TRUE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_likely(&rt_group_sched); +} +# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else +# define rt_group_sched_enabled() false +#endif /* CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1402,8 +1547,10 @@ static inline void raw_spin_rq_unlock_irq(struct rq *rq) static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) { unsigned long flags; + local_irq_save(flags); raw_spin_rq_lock(rq); + return flags; } @@ -1432,9 +1579,10 @@ static inline void update_idle_core(struct rq *rq) { } #endif #ifdef CONFIG_FAIR_GROUP_SCHED + static inline struct task_struct *task_of(struct sched_entity *se) { - SCHED_WARN_ON(!entity_is_task(se)); + WARN_ON_ONCE(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -1455,9 +1603,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ -#define task_of(_se) container_of(_se, struct task_struct, se) +#define task_of(_se) container_of(_se, struct task_struct, se) static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) { @@ -1477,7 +1625,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { return NULL; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void update_rq_clock(struct rq *rq); @@ -1514,7 +1663,7 @@ static inline void assert_clock_updated(struct rq *rq) * The only reason for not seeing a clock update since the * last rq_pin_lock() is if we're currently skipping updates. */ - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); } static inline u64 rq_clock(struct rq *rq) @@ -1533,24 +1682,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. - * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_thermal_decay_shift; - -static inline u64 rq_clock_thermal(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_thermal_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_rq_held(rq); @@ -1579,7 +1710,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) static inline void rq_clock_start_loop_update(struct rq *rq) { lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); rq->clock_update_flags |= RQCF_ACT_SKIP; } @@ -1592,18 +1723,48 @@ static inline void rq_clock_stop_loop_update(struct rq *rq) struct rq_flags { unsigned long flags; struct pin_cookie cookie; -#ifdef CONFIG_SCHED_DEBUG /* * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the * current pin context is stashed here in case it needs to be * restored in rq_repin_lock(). */ unsigned int clock_update_flags; -#endif }; extern struct balance_callback balance_push_callback; +#ifdef CONFIG_SCHED_CLASS_EXT +extern const struct sched_class ext_sched_class; + +DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ + +#define scx_enabled() static_branch_unlikely(&__scx_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.clock, clock); + smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID); +} + +static inline void scx_rq_clock_invalidate(struct rq *rq) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); +} + +#else /* !CONFIG_SCHED_CLASS_EXT */ +#define scx_enabled() false +#define scx_switched_all() false + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} +static inline void scx_rq_clock_invalidate(struct rq *rq) {} +#endif /* !CONFIG_SCHED_CLASS_EXT */ + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). @@ -1618,22 +1779,19 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); -#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; #ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); #endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SCHED_DEBUG if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; -#endif + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1641,17 +1799,17 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(__rq_lockp(rq), rf->cookie); -#ifdef CONFIG_SCHED_DEBUG /* * Restore the value we stashed in @rf for this pin context. */ rq->clock_update_flags |= rf->clock_update_flags; -#endif } +extern struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); +extern struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1678,48 +1836,42 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irqrestore(rq, rf->flags); } -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irq(rq); } -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1741,8 +1893,7 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, rq_unlock_irqrestore(_T->lock, &_T->rf), struct rq_flags rf) -static inline struct rq * -this_rq_lock_irq(struct rq_flags *rf) +static inline struct rq *this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) { struct rq *rq; @@ -1750,15 +1901,18 @@ this_rq_lock_irq(struct rq_flags *rf) local_irq_disable(); rq = this_rq(); rq_lock(rq, rf); + return rq; } #ifdef CONFIG_NUMA + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE, }; + extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); @@ -1767,18 +1921,23 @@ extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -#else + +#else /* !CONFIG_NUMA: */ + static inline void sched_init_numa(int offline_node) { } static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } + static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { return nr_cpu_ids; } -#endif + +#endif /* !CONFIG_NUMA */ #ifdef CONFIG_NUMA_BALANCING + /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { NUMA_MEM = 0, @@ -1786,17 +1945,21 @@ enum numa_faults_stats { NUMA_MEMBUF, NUMA_CPUBUF }; + extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); -#else + +#else /* !CONFIG_NUMA_BALANCING: */ + static inline void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { } -#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* !CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP @@ -1821,8 +1984,7 @@ queue_balance_callback(struct rq *rq, } #define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) + rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -1893,6 +2055,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + extern struct static_key_false sched_asym_cpucapacity; extern struct static_key_false sched_cluster_active; @@ -1913,9 +2076,7 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -#ifdef CONFIG_SCHED_DEBUG int id; -#endif unsigned long cpumask[]; /* Balance mask */ }; @@ -1955,17 +2116,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); -#ifdef CONFIG_SCHED_DEBUG -void update_sched_domain_debugfs(void); -void dirty_sched_domain_sysctl(int cpu); -#else -static inline void update_sched_domain_debugfs(void) -{ -} -static inline void dirty_sched_domain_sysctl(int cpu) -{ -} -#endif +extern void update_sched_domain_debugfs(void); +extern void dirty_sched_domain_sysctl(int cpu); extern int sched_update_scaling(void); @@ -1975,35 +2127,8 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return cpu_possible_mask; /* &init_task.cpus_mask */ return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) - -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else - -static inline void sched_core_account_forceidle(struct rq *rq) {} - -static inline void sched_core_tick(struct rq *rq) {} - -#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_CGROUP_SCHED @@ -2040,20 +2165,28 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_RT_GROUP_SCHED + /* + * p->rt.rt_rq is NULL initially and it is easier to assign + * root_task_group's rt_rq than switching in rt_rq_of_se() + * Clobbers tg(!) + */ + if (!rt_group_sched_enabled()) + tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; #endif } -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } + static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2071,13 +2204,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + * Tunables: */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug const -#endif #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -2089,15 +2217,14 @@ enum { #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG - /* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c. */ -extern const_debug unsigned int sysctl_sched_features; +extern __read_mostly unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -2110,29 +2237,11 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* CONFIG_JUMP_LABEL */ - -#else /* !SCHED_DEBUG */ - -/* - * Each translation unit has its own copy of sysctl_sched_features to allow - * constants propagation at compile time and compiler optimization based on - * features default. - */ -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | -static const_debug __maybe_unused unsigned int sysctl_sched_features = -#include "features.h" - 0; -#undef SCHED_FEAT - -#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -#endif /* SCHED_DEBUG */ +#endif /* !CONFIG_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2150,11 +2259,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. + */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP @@ -2166,7 +2289,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -2175,13 +2298,14 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2224,14 +2348,17 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ -#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2247,13 +2374,15 @@ extern const u32 sched_prio_to_wmult[40]; #endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) struct affinity_context { - const struct cpumask *new_mask; - struct cpumask *user_mask; - unsigned int flags; + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; }; extern s64 update_curr_common(struct rq *rq); @@ -2265,23 +2394,31 @@ struct sched_class { #endif void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - struct task_struct *(*pick_next_task)(struct rq *rq); + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + struct task_struct *(*pick_task)(struct rq *rq); + /* + * Optional! When implemented pick_next_task() should be equivalent to: + * + * next = pick_task(); + * if (next) { + * put_prev_task(prev); + * set_next_task_first(next); + * } + */ + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); - struct task_struct * (*pick_task)(struct rq *rq); - void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -2303,8 +2440,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + const struct load_weight *lw); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2324,8 +2464,8 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + WARN_ON_ONCE(rq->donor != prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -2333,6 +2473,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } +static inline void +__put_prev_set_next_dl_server(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + prev->dl_server = NULL; + next->dl_server = rq->dl_server; + rq->dl_server = NULL; +} + +static inline void put_prev_set_next_task(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + WARN_ON_ONCE(rq->curr != prev); + + __put_prev_set_next_dl_server(rq, prev, next); + + if (next == prev) + return; + + prev->sched_class->put_prev_task(rq, prev, next); + next->sched_class->set_next_task(rq, next, true); +} /* * Helper to define a sched_class instance; each one is placed in a separate @@ -2353,19 +2517,41 @@ const struct sched_class name##_sched_class \ extern struct sched_class __sched_class_highest[]; extern struct sched_class __sched_class_lowest[]; +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + +/* + * Iterate only active classes. SCX can take over all fair tasks or be + * completely disabled. If the former, skip fair. If the latter, skip SCX. + */ +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; +#ifdef CONFIG_SCHED_CLASS_EXT + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; +#endif + return class; +} + #define for_class_range(class, _from, _to) \ for (class = (_from); class < (_to); class++) #define for_each_class(class) \ for_class_range(class, __sched_class_highest, __sched_class_lowest) -#define sched_class_above(_a, _b) ((_a) < (_b)) +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) -extern const struct sched_class stop_sched_class; -extern const struct sched_class dl_sched_class; -extern const struct sched_class rt_sched_class; -extern const struct sched_class fair_sched_class; -extern const struct sched_class idle_sched_class; +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) static inline bool sched_stop_runnable(struct rq *rq) { @@ -2384,11 +2570,11 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_next_task_idle(struct rq *rq); +extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2399,13 +2585,37 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); +extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + /* When not in the task's cpumask, no point in looking further. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + + /* Can @cpu run a user thread? */ + if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) + return false; + + return true; +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); +} + static inline struct task_struct *get_push_task(struct rq *rq) { - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; lockdep_assert_rq_held(rq); @@ -2424,9 +2634,28 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#endif +#else /* !CONFIG_SMP: */ + +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + return true; +} + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) +{ + return set_cpus_allowed_ptr(p, ctx->new_mask); +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + +#endif /* !CONFIG_SMP */ #ifdef CONFIG_CPU_IDLE + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2435,11 +2664,13 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); return rq->idle_state; } -#else + +#else /* !CONFIG_CPU_IDLE: */ + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2449,7 +2680,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } -#endif + +#endif /* !CONFIG_CPU_IDLE */ extern void schedule_idle(void); asmlinkage void schedule_user(void); @@ -2462,12 +2694,10 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); - extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); -extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -2478,7 +2708,8 @@ extern void init_dl_entity(struct sched_dl_entity *dl_se); #define RATIO_SHIFT 8 #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -unsigned long to_ratio(u64 period, u64 runtime); + +extern unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -2504,10 +2735,10 @@ static inline void sched_update_tick_dependency(struct rq *rq) else tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#else +#else /* !CONFIG_NO_HZ_FULL: */ static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -2519,10 +2750,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); @@ -2539,23 +2768,65 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } +static inline void __block_task(struct rq *rq, struct task_struct *p) +{ + if (p->sched_contributes_to_load) + rq->nr_uninterruptible++; + + if (p->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); +} + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT -#define SCHED_NR_MIGRATE_BREAK 8 +# define SCHED_NR_MIGRATE_BREAK 8 #else -#define SCHED_NR_MIGRATE_BREAK 32 +# define SCHED_NR_MIGRATE_BREAK 32 #endif -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; -#ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2566,7 +2837,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; -#endif #ifdef CONFIG_SCHED_HRTICK @@ -2596,9 +2866,9 @@ static inline int hrtick_enabled_dl(struct rq *rq) return hrtick_enabled(rq); } -void hrtick_start(struct rq *rq, u64 delay); +extern void hrtick_start(struct rq *rq, u64 delay); -#else +#else /* !CONFIG_SCHED_HRTICK: */ static inline int hrtick_enabled_fair(struct rq *rq) { @@ -2615,13 +2885,10 @@ static inline int hrtick_enabled(struct rq *rq) return 0; } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -static __always_inline -void arch_scale_freq_tick(void) -{ -} +static __always_inline void arch_scale_freq_tick(void) { } #endif #ifndef arch_scale_freq_capacity @@ -2642,7 +2909,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SCHED_DEBUG /* * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to * acquire rq lock instead of rq_lock(). So at the end of these two functions @@ -2657,14 +2923,11 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); #endif } -#else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} -#endif -#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ -__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ -static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ -{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } #ifdef CONFIG_SMP @@ -2718,7 +2981,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#else +#else /* !CONFIG_PREEMPTION: */ /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are @@ -2749,7 +3012,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#endif /* CONFIG_PREEMPTION */ +#endif /* !CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. @@ -2825,9 +3088,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern void set_rq_online (struct rq *rq); extern void set_rq_offline(struct rq *rq); + extern bool sched_smp_initialized; -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ /* * double_rq_lock - safely lock two runqueues @@ -2861,7 +3125,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -#endif +#endif /* !CONFIG_SMP */ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), @@ -2871,7 +3135,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -#ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); @@ -2883,15 +3146,11 @@ extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); #ifdef CONFIG_NUMA_BALANCING -extern void -show_numa_stats(struct task_struct *p, struct seq_file *m); +extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, - unsigned long tpf, unsigned long gsf, unsigned long gpf); + unsigned long tpf, unsigned long gsf, unsigned long gpf); #endif /* CONFIG_NUMA_BALANCING */ -#else -static inline void resched_latency_warn(int cpu, u64 latency) {} -#endif /* CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -2901,12 +3160,13 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON + #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 #define NOHZ_NEWILB_KICK_BIT 2 #define NOHZ_NEXT_KICK_BIT 3 -/* Run rebalance_domains() */ +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) /* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) @@ -2915,14 +3175,14 @@ extern void cfs_bandwidth_usage_dec(void); /* Update nohz.next_balance */ #define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) -#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) extern void nohz_balance_exit_idle(struct rq *rq); -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balance_exit_idle(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_run_idle_balance(int cpu); @@ -2930,7 +3190,36 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { u64 total; u64 tick_delta; @@ -2939,6 +3228,12 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +extern int sched_clock_irqtime; + +static inline int irqtime_enabled(void) +{ + return sched_clock_irqtime; +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. @@ -2958,9 +3253,18 @@ static inline u64 irq_time_read(int cpu) return total; } + +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ + DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** @@ -2994,9 +3298,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) if (data) data->func(data, rq_clock(rq), flags); } -#else -static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -#endif /* CONFIG_CPU_FREQ */ +#else /* !CONFIG_CPU_FREQ: */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } +#endif /* !CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant @@ -3007,6 +3311,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3049,11 +3354,40 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#endif + +#else /* !CONFIG_SMP */ +static inline bool update_other_load_avgs(struct rq *rq) { return false; } +#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK + unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} + +/* + * Enabling static branches would get the cpus_read_lock(), + * check whether uclamp_is_used before enable it to avoid always + * calling cpus_read_lock(). Because we never disable this + * static key once enable it. + */ +static inline void sched_uclamp_enable(void) +{ + if (!uclamp_is_used()) + static_branch_enable(&sched_uclamp_used); +} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3077,7 +3411,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) unsigned long rq_util; unsigned long max_util; - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); @@ -3086,21 +3420,39 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; } -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. - */ -static inline bool uclamp_is_used(void) +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; + + +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { - return static_branch_likely(&sched_uclamp_used); + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); +} + +static inline void +uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; } -#else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_eff_value(struct task_struct *p, - enum uclamp_id clamp_id) + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3115,8 +3467,10 @@ static inline bool uclamp_is_used(void) return false; } -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) +static inline void sched_uclamp_enable(void) {} + +static inline unsigned long +uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3124,8 +3478,8 @@ static inline unsigned long uclamp_rq_get(struct rq *rq, return SCHED_CAPACITY_SCALE; } -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) +static inline void +uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) { } @@ -3133,9 +3487,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) { return false; } -#endif /* CONFIG_UCLAMP_TASK */ + +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + static inline unsigned long cpu_util_irq(struct rq *rq) { return READ_ONCE(rq->avg_irq.util_avg); @@ -3150,7 +3506,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } -#else + +#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */ + static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; @@ -3161,7 +3519,10 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned { return util; } -#endif + +#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ + +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -3174,16 +3535,16 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -extern struct cpufreq_governor schedutil_gov; - #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL + static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_MEMBARRIER + /* * The scheduler provides memory barriers required by membarrier between: * - prior user-space memory accesses and store to rq->membarrier_state, @@ -3205,13 +3566,16 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else + +#else /* !CONFIG_MEMBARRIER :*/ + static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, struct mm_struct *next_mm) { } -#endif + +#endif /* !CONFIG_MEMBARRIER */ #ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) @@ -3236,6 +3600,7 @@ extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID @@ -3263,7 +3628,7 @@ static inline void __mm_cid_put(struct mm_struct *mm, int cid) * be held to transition to other states. * * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across cpus, which prevents use of this_cpu_cmpxchg. + * consistent across CPUs, which prevents use of this_cpu_cmpxchg. */ static inline void mm_cid_put_lazy(struct task_struct *t) { @@ -3311,25 +3676,62 @@ static inline void mm_cid_put(struct mm_struct *mm) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } -static inline int __mm_cid_try_get(struct mm_struct *mm) +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, max_nr_cid, allowed_max_nr_cid; - cpumask = mm_cidmask(mm); /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } + /* Try to re-use recent cid. This improves cache locality. */ + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = max_nr_cid; + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } + /* + * Find the first available concurrency id. * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; + return cid; } @@ -3345,7 +3747,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3355,13 +3758,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. */ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3381,7 +3784,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. */ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3394,10 +3797,12 @@ unlock: raw_spin_unlock(&cid_lock); end: mm_cid_snapshot_time(rq, mm); + return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3414,8 +3819,10 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); + return cid; } @@ -3467,18 +3874,115 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } -#else +#else /* !CONFIG_SCHED_MM_CID: */ static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } -#endif +#endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif + +#ifdef CONFIG_RT_MUTEXES + +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + +#else /* !CONFIG_RT_MUTEXES: */ + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} + +#endif /* !CONFIG_RT_MUTEXES */ + +extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); +extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); +extern const struct sched_class *__setscheduler_class(int policy, int prio); +extern void set_load_weight(struct task_struct *p, bool update_load); +extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + +#ifdef CONFIG_SMP +extern struct balance_callback *splice_balance_callbacks(struct rq *rq); +extern void balance_callbacks(struct rq *rq, struct balance_callback *head); +#else + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ +} + +#endif + +#ifdef CONFIG_SCHED_CLASS_EXT +/* + * Used by SCX in the enable/disable paths to move tasks between sched_classes + * and establish invariants. + */ +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#include "ext.h" #endif /* _KERNEL_SCHED_SCHED_H */ |