diff options
Diffstat (limited to 'kernel/sched/sched.h')
-rw-r--r-- | kernel/sched/sched.h | 1086 |
1 files changed, 808 insertions, 278 deletions
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 001fe047bd5d..023b844159c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -68,17 +68,26 @@ #include <linux/wait_api.h> #include <linux/wait_bit.h> #include <linux/workqueue_api.h> +#include <linux/delayacct.h> #include <trace/events/power.h> #include <trace/events/sched.h> #include "../workqueue_internal.h" +struct rq; +struct cfs_rq; +struct rt_rq; +struct sched_group; +struct cpuidle_state; + #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> #endif +#include <asm/barrier.h> + #include "cpupri.h" #include "cpudeadline.h" @@ -88,9 +97,6 @@ # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif -struct rq; -struct cpuidle_state; - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -110,14 +116,28 @@ extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; /* + * Asymmetric CPU capacity bits + */ +struct asym_cap_data { + struct list_head link; + struct rcu_head rcu; + unsigned long capacity; + unsigned long cpus[]; +}; + +extern struct list_head asym_cap_list; + +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) + +/* * Helpers for converting nanosecond timing to jiffy resolution */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ)) /* * Increase resolution of nice-level calculations for 64-bit architectures. * The extra resolution improves shares distribution and load balancing of - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group * hierarchies, especially on larger systems. This is not a user-visible change * and does not change the user-interface for setting shares/weights. * @@ -131,12 +151,13 @@ extern int sched_rr_timeslice; #ifdef CONFIG_64BIT # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) -# define scale_load_down(w) \ -({ \ - unsigned long __w = (w); \ - if (__w) \ - __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ - __w; \ +# define scale_load_down(w) \ +({ \ + unsigned long __w = (w); \ + \ + if (__w) \ + __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \ + __w; \ }) #else # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) @@ -171,9 +192,19 @@ static inline int idle_policy(int policy) { return policy == SCHED_IDLE; } + +static inline int normal_policy(int policy) +{ +#ifdef CONFIG_SCHED_CLASS_EXT + if (policy == SCHED_EXT) + return true; +#endif + return policy == SCHED_NORMAL; +} + static inline int fair_policy(int policy) { - return policy == SCHED_NORMAL || policy == SCHED_BATCH; + return normal_policy(policy) || policy == SCHED_BATCH; } static inline int rt_policy(int policy) @@ -185,6 +216,7 @@ static inline int dl_policy(int policy) { return policy == SCHED_DEADLINE; } + static inline bool valid_policy(int policy) { return idle_policy(policy) || fair_policy(policy) || @@ -206,11 +238,12 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) { s64 diff = sample - *avg; + *avg += diff / 8; } @@ -222,6 +255,24 @@ static inline void update_avg(u64 *avg, u64 sample) (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) /* + * cgroup weight knobs should use the common MIN, DFL and MAX values which are + * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it + * maps pretty well onto the shares value used by scheduler and the round-trip + * conversions preserve the original value over the entire range. + */ +static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) +{ + return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); +} + +static inline unsigned long sched_weight_to_cgroup(unsigned long weight) +{ + return clamp_t(unsigned long, + DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), + CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); +} + +/* * !! For sched_setattr_nocheck() (kernel) only !! * * This is actually gross. :( @@ -235,7 +286,7 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 -#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { @@ -311,8 +362,8 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); - +extern int dl_bw_deactivate(int cpu); +extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following * interface: @@ -338,12 +389,21 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick); + dl_server_pick_f pick_task); -#ifdef CONFIG_CGROUP_SCHED +extern void dl_server_update_idle_time(struct rq *rq, + struct task_struct *p); +extern void fair_server_init(struct rq *rq); +extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); +extern int dl_server_apply_params(struct sched_dl_entity *dl_se, + u64 runtime, u64 period, bool init); -struct cfs_rq; -struct rt_rq; +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} + +#ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; @@ -377,20 +437,21 @@ struct cfs_bandwidth { struct task_group { struct cgroup_subsys_state css; +#ifdef CONFIG_GROUP_SCHED_WEIGHT + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each CPU */ struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; - - /* A positive value indicates that this is a SCHED_IDLE group. */ - int idle; - #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put - * it in its own cacheline separated from the fields above which + * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; @@ -404,6 +465,11 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif +#ifdef CONFIG_EXT_GROUP_SCHED + u32 scx_flags; /* SCX_TG_* */ + u32 scx_weight; +#endif + struct rcu_head rcu; struct list_head list; @@ -428,7 +494,7 @@ struct task_group { }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED_WEIGHT #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -459,6 +525,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) return walk_tg_tree_from(&root_task_group, down, up, data); } +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + extern int tg_nop(struct task_group *tg, void *data); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -501,7 +572,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -515,11 +586,15 @@ extern void set_task_rq_fair(struct sched_entity *se, static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } #endif /* CONFIG_SMP */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ +static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } +static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } #endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; + static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif /* CONFIG_CGROUP_SCHED */ @@ -535,8 +610,8 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent * applicable for 32-bits architectures. */ #ifdef CONFIG_64BIT -# define u64_u32_load_copy(var, copy) var -# define u64_u32_store_copy(var, copy, val) (var = val) +# define u64_u32_load_copy(var, copy) var +# define u64_u32_store_copy(var, copy, val) (var = val) #else # define u64_u32_load_copy(var, copy) \ ({ \ @@ -564,31 +639,31 @@ do { \ copy = __val; \ } while (0) #endif -# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) -# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) +# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy) +# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val) + +struct balance_callback { + struct balance_callback *next; + void (*func)(struct rq *rq); +}; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; - u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 min_vruntime_fi; #endif -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - struct rb_root_cached tasks_timeline; /* @@ -598,10 +673,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - #ifdef CONFIG_SMP /* * CFS load tracking @@ -675,6 +746,47 @@ struct cfs_rq { #endif /* CONFIG_FAIR_GROUP_SCHED */ }; +#ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + /* + * A hotplugged CPU starts scheduling before rq_online_scx(). Track + * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called + * only while the BPF scheduler considers the CPU to be online. + */ + SCX_RQ_ONLINE = 1 << 0, + SCX_RQ_CAN_STOP_TICK = 1 << 1, + SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ + SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ + SCX_RQ_BYPASSING = 1 << 4, + SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + + SCX_RQ_IN_WAKEUP = 1 << 16, + SCX_RQ_IN_BALANCE = 1 << 17, +}; + +struct scx_rq { + struct scx_dispatch_q local_dsq; + struct list_head runnable_list; /* runnable tasks on this rq */ + struct list_head ddsp_deferred_locals; /* deferred ddsps from enq */ + unsigned long ops_qseq; + u64 extra_enq_flags; /* see move_task_to_local_dsq() */ + u32 nr_running; + u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool cpu_released; + u32 flags; + u64 clock; /* current per-rq clock -- see scx_bpf_now() */ + cpumask_var_t cpus_to_kick; + cpumask_var_t cpus_to_kick_if_idle; + cpumask_var_t cpus_to_preempt; + cpumask_var_t cpus_to_wait; + unsigned long pnt_seq; + struct balance_callback deferred_bal_cb; + struct irq_work deferred_irq_work; + struct irq_work kick_cpus_irq_work; +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static inline int rt_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; @@ -699,19 +811,19 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - int overloaded; + bool overloaded; struct plist_head pushable_tasks; #endif /* CONFIG_SMP */ int rt_queued; +#ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; -#ifdef CONFIG_RT_GROUP_SCHED unsigned int rt_nr_boosted; struct rq *rq; @@ -743,7 +855,7 @@ struct dl_rq { u64 next; } earliest_dl; - int overloaded; + bool overloaded; /* * Tasks on this rq that can be pushed away. They are kept in @@ -787,33 +899,42 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED + /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) static inline void se_update_runnable(struct sched_entity *se) { if (!entity_is_task(se)) - se->runnable_weight = se->my_q->h_nr_running; + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + if (entity_is_task(se)) return !!se->on_rq; else return se->runnable_weight; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ + #define entity_is_task(se) 1 -static inline void se_update_runnable(struct sched_entity *se) {} +static inline void se_update_runnable(struct sched_entity *se) { } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + return !!se->on_rq; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP /* @@ -836,10 +957,6 @@ struct perf_domain { struct rcu_head rcu; }; -/* Scheduling group status flags */ -#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ - /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -860,10 +977,10 @@ struct root_domain { * - More than one runnable task * - Running task is misfit */ - int overload; + bool overloaded; - /* Indicate one or more cpus over-utilized (tipping point) */ - int overutilized; + /* Indicate one or more CPUs over-utilized (tipping point) */ + bool overutilized; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -903,8 +1020,6 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; - /* * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. @@ -918,6 +1033,17 @@ extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); extern void sched_put_rd(struct root_domain *rd); +static inline int get_rd_overloaded(struct root_domain *rd) +{ + return READ_ONCE(rd->overloaded); +} + +static inline void set_rd_overloaded(struct root_domain *rd, int status) +{ + if (get_rd_overloaded(rd) != status) + WRITE_ONCE(rd->overloaded, status); +} + #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif @@ -967,12 +1093,6 @@ struct uclamp_rq { DECLARE_STATIC_KEY_FALSE(sched_uclamp_used); #endif /* CONFIG_UCLAMP_TASK */ -struct rq; -struct balance_callback { - struct balance_callback *next; - void (*func)(struct rq *rq); -}; - /* * This is the main, per-CPU runqueue data structure. * @@ -1015,6 +1135,11 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_EXT + struct scx_rq scx; +#endif + + struct sched_dl_entity fair_server; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1030,7 +1155,11 @@ struct rq { */ unsigned int nr_uninterruptible; - struct task_struct __rcu *curr; + union { + struct task_struct __rcu *donor; /* Scheduler context */ + struct task_struct __rcu *curr; /* Execution context */ + }; + struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -1089,8 +1218,8 @@ struct rq { #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; #endif -#ifdef CONFIG_SCHED_THERMAL_PRESSURE - struct sched_avg avg_thermal; +#ifdef CONFIG_SCHED_HW_PRESSURE + struct sched_avg avg_hw; #endif u64 idle_stamp; u64 avg_idle; @@ -1105,6 +1234,7 @@ struct rq { #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; + u64 psi_irq_time; #endif #ifdef CONFIG_PARAVIRT u64 prev_steal_time; @@ -1122,14 +1252,13 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; - ktime_t hrtick_time; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_count; @@ -1144,7 +1273,7 @@ struct rq { #endif #ifdef CONFIG_CPU_IDLE - /* Must be inspected within a rcu lock section */ + /* Must be inspected within a RCU lock section */ struct cpuidle_state *idle_state; #endif @@ -1158,6 +1287,7 @@ struct rq { /* per rq */ struct rq *core; struct task_struct *core_pick; + struct sched_dl_entity *core_dl_server; unsigned int core_enabled; unsigned int core_sched_seq; struct rb_root core_tree; @@ -1206,7 +1336,7 @@ static inline int cpu_of(struct rq *rq) #endif } -#define MDF_PUSH 0x01 +#define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { @@ -1225,7 +1355,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) -struct sched_group; +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + /* Do nothing */ +} + #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); @@ -1261,9 +1395,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, - bool fi); -void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); +extern bool +cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1331,7 +1466,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) { @@ -1369,7 +1504,8 @@ static inline bool sched_group_cookie_match(struct rq *rq, { return true; } -#endif /* CONFIG_SCHED_CORE */ + +#endif /* !CONFIG_SCHED_CORE */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1400,8 +1536,10 @@ static inline void raw_spin_rq_unlock_irq(struct rq *rq) static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq) { unsigned long flags; + local_irq_save(flags); raw_spin_rq_lock(rq); + return flags; } @@ -1430,6 +1568,7 @@ static inline void update_idle_core(struct rq *rq) { } #endif #ifdef CONFIG_FAIR_GROUP_SCHED + static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -1453,9 +1592,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ -#define task_of(_se) container_of(_se, struct task_struct, se) +#define task_of(_se) container_of(_se, struct task_struct, se) static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) { @@ -1475,7 +1614,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) { return NULL; } -#endif + +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void update_rq_clock(struct rq *rq); @@ -1531,24 +1671,6 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } -/** - * By default the decay is the default pelt decay period. - * The decay shift can change the decay period in - * multiples of 32. - * Decay shift Decay period(ms) - * 0 32 - * 1 64 - * 2 128 - * 3 256 - * 4 512 - */ -extern int sched_thermal_decay_shift; - -static inline u64 rq_clock_thermal(struct rq *rq) -{ - return rq_clock_task(rq) >> sched_thermal_decay_shift; -} - static inline void rq_clock_skip_update(struct rq *rq) { lockdep_assert_rq_held(rq); @@ -1602,6 +1724,38 @@ struct rq_flags { extern struct balance_callback balance_push_callback; +#ifdef CONFIG_SCHED_CLASS_EXT +extern const struct sched_class ext_sched_class; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ + +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.clock, clock); + smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID); +} + +static inline void scx_rq_clock_invalidate(struct rq *rq) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); +} + +#else /* !CONFIG_SCHED_CLASS_EXT */ +#define scx_enabled() false +#define scx_switched_all() false + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} +static inline void scx_rq_clock_invalidate(struct rq *rq) {} +#endif /* !CONFIG_SCHED_CLASS_EXT */ + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). @@ -1619,9 +1773,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif +# endif #endif } @@ -1631,7 +1785,7 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; #endif - + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1647,9 +1801,11 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +extern struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); +extern struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1676,48 +1832,42 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irqsave(rq, rf->flags); rq_pin_lock(rq, rf); } -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock_irq(rq); rq_pin_lock(rq, rf); } -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) +static inline void rq_lock(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { raw_spin_rq_lock(rq); rq_pin_lock(rq, rf); } -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irqrestore(rq, rf->flags); } -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); raw_spin_rq_unlock_irq(rq); } -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1739,8 +1889,7 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, rq_unlock_irqrestore(_T->lock, &_T->rf), struct rq_flags rf) -static inline struct rq * -this_rq_lock_irq(struct rq_flags *rf) +static inline struct rq *this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) { struct rq *rq; @@ -1748,15 +1897,18 @@ this_rq_lock_irq(struct rq_flags *rf) local_irq_disable(); rq = this_rq(); rq_lock(rq, rf); + return rq; } #ifdef CONFIG_NUMA + enum numa_topology_type { NUMA_DIRECT, NUMA_GLUELESS_MESH, NUMA_BACKPLANE, }; + extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); @@ -1765,18 +1917,23 @@ extern void sched_update_numa(int cpu, bool online); extern void sched_domains_numa_masks_set(unsigned int cpu); extern void sched_domains_numa_masks_clear(unsigned int cpu); extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -#else + +#else /* !CONFIG_NUMA: */ + static inline void sched_init_numa(int offline_node) { } static inline void sched_update_numa(int cpu, bool online) { } static inline void sched_domains_numa_masks_set(unsigned int cpu) { } static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } + static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) { return nr_cpu_ids; } -#endif + +#endif /* !CONFIG_NUMA */ #ifdef CONFIG_NUMA_BALANCING + /* The regions in numa_faults array from task_struct */ enum numa_faults_stats { NUMA_MEM = 0, @@ -1784,17 +1941,21 @@ enum numa_faults_stats { NUMA_MEMBUF, NUMA_CPUBUF }; + extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); -#else + +#else /* !CONFIG_NUMA_BALANCING: */ + static inline void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { } -#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* !CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP @@ -1819,8 +1980,7 @@ queue_balance_callback(struct rq *rq, } #define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) + rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -1891,6 +2051,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + extern struct static_key_false sched_asym_cpucapacity; extern struct static_key_false sched_cluster_active; @@ -1954,15 +2115,11 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); #ifdef CONFIG_SCHED_DEBUG -void update_sched_domain_debugfs(void); -void dirty_sched_domain_sysctl(int cpu); +extern void update_sched_domain_debugfs(void); +extern void dirty_sched_domain_sysctl(int cpu); #else -static inline void update_sched_domain_debugfs(void) -{ -} -static inline void dirty_sched_domain_sysctl(int cpu) -{ -} +static inline void update_sched_domain_debugfs(void) { } +static inline void dirty_sched_domain_sysctl(int cpu) { } #endif extern int sched_update_scaling(void); @@ -1973,35 +2130,8 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return cpu_possible_mask; /* &init_task.cpus_mask */ return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - -#include "stats.h" - -#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) -extern void __sched_core_account_forceidle(struct rq *rq); - -static inline void sched_core_account_forceidle(struct rq *rq) -{ - if (schedstat_enabled()) - __sched_core_account_forceidle(rq); -} - -extern void __sched_core_tick(struct rq *rq); - -static inline void sched_core_tick(struct rq *rq) -{ - if (sched_core_enabled(rq) && schedstat_enabled()) - __sched_core_tick(rq); -} - -#else - -static inline void sched_core_account_forceidle(struct rq *rq) {} - -static inline void sched_core_tick(struct rq *rq) {} - -#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_CGROUP_SCHED @@ -2043,15 +2173,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } + static inline struct task_group *task_group(struct task_struct *p) { return NULL; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2096,6 +2227,7 @@ enum { extern const_debug unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL + #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -2108,13 +2240,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ -#else /* !SCHED_DEBUG */ +#else /* !SCHED_DEBUG: */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -2130,7 +2262,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG */ +#endif /* !SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2148,11 +2280,25 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +/* + * Is p the current execution context? + */ static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; } +/* + * Is p the current scheduling context? + * + * Note that it might be the current execution context at the same time if + * rq->curr == rq->donor == p. + */ +static inline int task_current_donor(struct rq *rq, struct task_struct *p) +{ + return rq->donor == p; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP @@ -2164,7 +2310,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -2173,13 +2319,14 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ +#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2222,14 +2369,17 @@ extern const u32 sched_prio_to_wmult[40]; * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup + * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * */ -#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2245,13 +2395,15 @@ extern const u32 sched_prio_to_wmult[40]; #endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_DELAYED 0x200 +#define ENQUEUE_RQ_SELECTED 0x400 #define RETRY_TASK ((void *)-1UL) struct affinity_context { - const struct cpumask *new_mask; - struct cpumask *user_mask; - unsigned int flags; + const struct cpumask *new_mask; + struct cpumask *user_mask; + unsigned int flags; }; extern s64 update_curr_common(struct rq *rq); @@ -2263,23 +2415,31 @@ struct sched_class { #endif void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - struct task_struct *(*pick_next_task)(struct rq *rq); + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + struct task_struct *(*pick_task)(struct rq *rq); + /* + * Optional! When implemented pick_next_task() should be equivalent to: + * + * next = pick_task(); + * if (next) { + * put_prev_task(prev); + * set_next_task_first(next); + * } + */ + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); - struct task_struct * (*pick_task)(struct rq *rq); - void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -2301,8 +2461,11 @@ struct sched_class { * cannot assume the switched_from/switched_to pair is serialized by * rq->lock. They are however serialized by p->pi_lock. */ + void (*switching_to) (struct rq *this_rq, struct task_struct *task); void (*switched_from)(struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*reweight_task)(struct rq *this_rq, struct task_struct *task, + const struct load_weight *lw); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio); @@ -2322,8 +2485,8 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { - WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + WARN_ON_ONCE(rq->donor != prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -2331,6 +2494,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } +static inline void +__put_prev_set_next_dl_server(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + prev->dl_server = NULL; + next->dl_server = rq->dl_server; + rq->dl_server = NULL; +} + +static inline void put_prev_set_next_task(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + WARN_ON_ONCE(rq->curr != prev); + + __put_prev_set_next_dl_server(rq, prev, next); + + if (next == prev) + return; + + prev->sched_class->put_prev_task(rq, prev, next); + next->sched_class->set_next_task(rq, next, true); +} /* * Helper to define a sched_class instance; each one is placed in a separate @@ -2351,19 +2538,41 @@ const struct sched_class name##_sched_class \ extern struct sched_class __sched_class_highest[]; extern struct sched_class __sched_class_lowest[]; +extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + +/* + * Iterate only active classes. SCX can take over all fair tasks or be + * completely disabled. If the former, skip fair. If the latter, skip SCX. + */ +static inline const struct sched_class *next_active_class(const struct sched_class *class) +{ + class++; +#ifdef CONFIG_SCHED_CLASS_EXT + if (scx_switched_all() && class == &fair_sched_class) + class++; + if (!scx_enabled() && class == &ext_sched_class) + class++; +#endif + return class; +} + #define for_class_range(class, _from, _to) \ for (class = (_from); class < (_to); class++) #define for_each_class(class) \ for_class_range(class, __sched_class_highest, __sched_class_lowest) -#define sched_class_above(_a, _b) ((_a) < (_b)) +#define for_active_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = next_active_class(class)) -extern const struct sched_class stop_sched_class; -extern const struct sched_class dl_sched_class; -extern const struct sched_class rt_sched_class; -extern const struct sched_class fair_sched_class; -extern const struct sched_class idle_sched_class; +#define for_each_active_class(class) \ + for_active_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) static inline bool sched_stop_runnable(struct rq *rq) { @@ -2382,11 +2591,11 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_next_task_idle(struct rq *rq); +extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2397,13 +2606,37 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); extern void update_group_capacity(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq); +extern void sched_balance_trigger(struct rq *rq); +extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx); extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx); +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + /* When not in the task's cpumask, no point in looking further. */ + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + + /* Can @cpu run a user thread? */ + if (!(p->flags & PF_KTHREAD) && !task_cpu_possible(cpu, p)) + return false; + + return true; +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); +} + static inline struct task_struct *get_push_task(struct rq *rq) { - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; lockdep_assert_rq_held(rq); @@ -2422,9 +2655,28 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#endif +#else /* !CONFIG_SMP: */ + +static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) +{ + return true; +} + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + struct affinity_context *ctx) +{ + return set_cpus_allowed_ptr(p, ctx->new_mask); +} + +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + +#endif /* !CONFIG_SMP */ #ifdef CONFIG_CPU_IDLE + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2437,7 +2689,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) return rq->idle_state; } -#else + +#else /* !CONFIG_CPU_IDLE: */ + static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) { @@ -2447,7 +2701,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } -#endif + +#endif /* !CONFIG_CPU_IDLE */ extern void schedule_idle(void); asmlinkage void schedule_user(void); @@ -2460,12 +2715,10 @@ extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); -extern void reweight_task(struct task_struct *p, int prio); - extern void resched_curr(struct rq *rq); +extern void resched_curr_lazy(struct rq *rq); extern void resched_cpu(int cpu); -extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -2476,7 +2729,8 @@ extern void init_dl_entity(struct sched_dl_entity *dl_se); #define RATIO_SHIFT 8 #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -unsigned long to_ratio(u64 period, u64 runtime); + +extern unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -2502,10 +2756,10 @@ static inline void sched_update_tick_dependency(struct rq *rq) else tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#else +#else /* !CONFIG_NO_HZ_FULL: */ static inline int sched_tick_offload_init(void) { return 0; } static inline void sched_update_tick_dependency(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -2517,10 +2771,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } #ifdef CONFIG_SMP - if (prev_nr < 2 && rq->nr_running >= 2) { - if (!READ_ONCE(rq->rd->overload)) - WRITE_ONCE(rq->rd->overload, 1); - } + if (prev_nr < 2 && rq->nr_running >= 2) + set_rd_overloaded(rq->rd, 1); #endif sched_update_tick_dependency(rq); @@ -2537,15 +2789,58 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } +static inline void __block_task(struct rq *rq, struct task_struct *p) +{ + if (p->sched_contributes_to_load) + rq->nr_uninterruptible++; + + if (p->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + + /* + * The moment this write goes through, ttwu() can swoop in and migrate + * this task, rendering our rq->__lock ineffective. + * + * __schedule() try_to_wake_up() + * LOCK rq->__lock LOCK p->pi_lock + * pick_next_task() + * pick_next_task_fair() + * pick_next_entity() + * dequeue_entities() + * __block_task() + * RELEASE p->on_rq = 0 if (p->on_rq && ...) + * break; + * + * ACQUIRE (after ctrl-dep) + * + * cpu = select_task_rq(); + * set_task_cpu(p, cpu); + * ttwu_queue() + * ttwu_do_activate() + * LOCK rq->__lock + * activate_task() + * STORE p->on_rq = 1 + * UNLOCK rq->__lock + * + * Callers must ensure to not reference @p after this -- we no longer + * own it. + */ + smp_store_release(&p->on_rq, 0); +} + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT -#define SCHED_NR_MIGRATE_BREAK 8 +# define SCHED_NR_MIGRATE_BREAK 8 #else -#define SCHED_NR_MIGRATE_BREAK 32 +# define SCHED_NR_MIGRATE_BREAK 32 #endif extern const_debug unsigned int sysctl_sched_nr_migrate; @@ -2594,9 +2889,9 @@ static inline int hrtick_enabled_dl(struct rq *rq) return hrtick_enabled(rq); } -void hrtick_start(struct rq *rq, u64 delay); +extern void hrtick_start(struct rq *rq, u64 delay); -#else +#else /* !CONFIG_SCHED_HRTICK: */ static inline int hrtick_enabled_fair(struct rq *rq) { @@ -2613,13 +2908,10 @@ static inline int hrtick_enabled(struct rq *rq) return 0; } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -static __always_inline -void arch_scale_freq_tick(void) -{ -} +static __always_inline void arch_scale_freq_tick(void) { } #endif #ifndef arch_scale_freq_capacity @@ -2656,13 +2948,13 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) #endif } #else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } #endif -#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ -__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ -static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ -{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } #ifdef CONFIG_SMP @@ -2716,7 +3008,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#else +#else /* !CONFIG_PREEMPTION: */ /* * Unfair double_lock_balance: Optimizes throughput at the expense of * latency by eliminating extra atomic operations when the locks are @@ -2747,7 +3039,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return 1; } -#endif /* CONFIG_PREEMPTION */ +#endif /* !CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. @@ -2823,9 +3115,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) extern void set_rq_online (struct rq *rq); extern void set_rq_offline(struct rq *rq); + extern bool sched_smp_initialized; -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ /* * double_rq_lock - safely lock two runqueues @@ -2859,7 +3152,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } -#endif +#endif /* !CONFIG_SMP */ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), @@ -2880,16 +3173,15 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); -#ifdef CONFIG_NUMA_BALANCING -extern void -show_numa_stats(struct task_struct *p, struct seq_file *m); +# ifdef CONFIG_NUMA_BALANCING +extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, - unsigned long tpf, unsigned long gsf, unsigned long gpf); -#endif /* CONFIG_NUMA_BALANCING */ -#else -static inline void resched_latency_warn(int cpu, u64 latency) {} -#endif /* CONFIG_SCHED_DEBUG */ + unsigned long tpf, unsigned long gsf, unsigned long gpf); +# endif /* CONFIG_NUMA_BALANCING */ +#else /* !CONFIG_SCHED_DEBUG: */ +static inline void resched_latency_warn(int cpu, u64 latency) { } +#endif /* !CONFIG_SCHED_DEBUG */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -2899,12 +3191,13 @@ extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); #ifdef CONFIG_NO_HZ_COMMON + #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 #define NOHZ_NEWILB_KICK_BIT 2 #define NOHZ_NEXT_KICK_BIT 3 -/* Run rebalance_domains() */ +/* Run sched_balance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) /* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) @@ -2913,14 +3206,14 @@ extern void cfs_bandwidth_usage_dec(void); /* Update nohz.next_balance */ #define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) -#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) extern void nohz_balance_exit_idle(struct rq *rq); -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balance_exit_idle(struct rq *rq) { } -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_run_idle_balance(int cpu); @@ -2928,7 +3221,36 @@ extern void nohz_run_idle_balance(int cpu); static inline void nohz_run_idle_balance(int cpu) { } #endif +#include "stats.h" + +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ + +static inline void sched_core_account_forceidle(struct rq *rq) { } + +static inline void sched_core_tick(struct rq *rq) { } + +#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING + struct irqtime { u64 total; u64 tick_delta; @@ -2937,6 +3259,12 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +extern int sched_clock_irqtime; + +static inline int irqtime_enabled(void) +{ + return sched_clock_irqtime; +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. @@ -2956,9 +3284,18 @@ static inline u64 irq_time_read(int cpu) return total; } + +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ + DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** @@ -2992,9 +3329,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) if (data) data->func(data, rq_clock(rq), flags); } -#else -static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -#endif /* CONFIG_CPU_FREQ */ +#else /* !CONFIG_CPU_FREQ: */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } +#endif /* !CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant @@ -3005,6 +3342,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP + unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3047,9 +3385,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#endif + +#else /* !CONFIG_SMP */ +static inline bool update_other_load_avgs(struct rq *rq) { return false; } +#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK + unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static inline unsigned long uclamp_rq_get(struct rq *rq, @@ -3096,9 +3438,40 @@ static inline bool uclamp_is_used(void) { return static_branch_likely(&sched_uclamp_used); } -#else /* CONFIG_UCLAMP_TASK */ -static inline unsigned long uclamp_eff_value(struct task_struct *p, - enum uclamp_id clamp_id) + +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; + + +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); +} + +static inline void +uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; +} + +#else /* !CONFIG_UCLAMP_TASK: */ + +static inline unsigned long +uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3113,8 +3486,8 @@ static inline bool uclamp_is_used(void) return false; } -static inline unsigned long uclamp_rq_get(struct rq *rq, - enum uclamp_id clamp_id) +static inline unsigned long +uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -3122,8 +3495,8 @@ static inline unsigned long uclamp_rq_get(struct rq *rq, return SCHED_CAPACITY_SCALE; } -static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, - unsigned int value) +static inline void +uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value) { } @@ -3131,12 +3504,14 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) { return false; } -#endif /* CONFIG_UCLAMP_TASK */ + +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + static inline unsigned long cpu_util_irq(struct rq *rq) { - return rq->avg_irq.util_avg; + return READ_ONCE(rq->avg_irq.util_avg); } static inline @@ -3148,7 +3523,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } -#else + +#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */ + static inline unsigned long cpu_util_irq(struct rq *rq) { return 0; @@ -3159,7 +3536,10 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned { return util; } -#endif + +#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ + +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -3177,11 +3557,13 @@ extern struct cpufreq_governor schedutil_gov; #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL + static inline bool sched_energy_enabled(void) { return false; } #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_MEMBARRIER + /* * The scheduler provides memory barriers required by membarrier between: * - prior user-space memory accesses and store to rq->membarrier_state, @@ -3203,13 +3585,16 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else + +#else /* !CONFIG_MEMBARRIER :*/ + static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, struct mm_struct *next_mm) { } -#endif + +#endif /* !CONFIG_MEMBARRIER */ #ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) @@ -3261,7 +3646,7 @@ static inline void __mm_cid_put(struct mm_struct *mm, int cid) * be held to transition to other states. * * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across cpus, which prevents use of this_cpu_cmpxchg. + * consistent across CPUs, which prevents use of this_cpu_cmpxchg. */ static inline void mm_cid_put_lazy(struct task_struct *t) { @@ -3309,25 +3694,62 @@ static inline void mm_cid_put(struct mm_struct *mm) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } -static inline int __mm_cid_try_get(struct mm_struct *mm) +static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { - struct cpumask *cpumask; - int cid; + struct cpumask *cidmask = mm_cidmask(mm); + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; + int cid, max_nr_cid, allowed_max_nr_cid; - cpumask = mm_cidmask(mm); /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } + /* Try to re-use recent cid. This improves cache locality. */ + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + /* + * Expand cid allocation if the maximum number of concurrency + * IDs allocated (max_nr_cid) is below the number cpus allowed + * and number of threads. Expanding cid allocation as much as + * possible improves cache locality. + */ + cid = max_nr_cid; + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) + continue; + if (!cpumask_test_and_set_cpu(cid, cidmask)) + return cid; + } + /* + * Find the first available concurrency id. * Retry finding first zero bit if the mask is temporarily * filled. This only happens during concurrent remote-clear * which owns a cid without holding a rq lock. */ for (;;) { - cid = cpumask_first_zero(cpumask); - if (cid < nr_cpu_ids) + cid = cpumask_first_zero(cidmask); + if (cid < READ_ONCE(mm->nr_cpus_allowed)) break; cpu_relax(); } - if (cpumask_test_and_set_cpu(cid, cpumask)) + if (cpumask_test_and_set_cpu(cid, cidmask)) return -1; + return cid; } @@ -3343,7 +3765,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) WRITE_ONCE(pcpu_cid->time, rq->clock); } -static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { int cid; @@ -3353,13 +3776,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * guarantee forward progress. */ if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto end; raw_spin_lock(&cid_lock); } else { raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); if (cid >= 0) goto unlock; } @@ -3379,7 +3802,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) * all newcoming allocations observe the use_cid_lock flag set. */ do { - cid = __mm_cid_try_get(mm); + cid = __mm_cid_try_get(t, mm); cpu_relax(); } while (cid < 0); /* @@ -3392,10 +3815,12 @@ unlock: raw_spin_unlock(&cid_lock); end: mm_cid_snapshot_time(rq, mm); + return cid; } -static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) +static inline int mm_cid_get(struct rq *rq, struct task_struct *t, + struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; struct cpumask *cpumask; @@ -3412,8 +3837,10 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); } - cid = __mm_cid_get(rq, mm); + cid = __mm_cid_get(rq, t, mm); __this_cpu_write(pcpu_cid->cid, cid); + __this_cpu_write(pcpu_cid->recent_cid, cid); + return cid; } @@ -3445,13 +3872,19 @@ static inline void switch_mm_cid(struct rq *rq, * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. * Provide it here. */ - if (!prev->mm) // from kernel + if (!prev->mm) { // from kernel smp_mb(); - /* - * user -> user transition guarantees a memory barrier through - * switch_mm() when current->mm changes. If current->mm is - * unchanged, no barrier is needed. - */ + } else { // from user + /* + * user->user transition relies on an implicit + * memory barrier in switch_mm() when + * current->mm changes. If the architecture + * switch_mm() does not have an implicit memory + * barrier, it is emitted here. If current->mm + * is unchanged, no barrier is needed. + */ + smp_mb__after_switch_mm(); + } } if (prev->mm_cid_active) { mm_cid_snapshot_time(rq, prev->mm); @@ -3459,18 +3892,115 @@ static inline void switch_mm_cid(struct rq *rq, prev->mm_cid = -1; } if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); } -#else +#else /* !CONFIG_SCHED_MM_CID: */ static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } -#endif +#endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); +#ifdef CONFIG_SMP +static inline +void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) +{ + lockdep_assert_rq_held(src_rq); + lockdep_assert_rq_held(dst_rq); + + deactivate_task(src_rq, task, 0); + set_task_cpu(task, dst_rq->cpu); + activate_task(dst_rq, task, 0); +} + +static inline +bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) + return true; + + return false; +} +#endif + +#ifdef CONFIG_RT_MUTEXES + +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + +#else /* !CONFIG_RT_MUTEXES: */ + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} + +#endif /* !CONFIG_RT_MUTEXES */ + +extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi); +extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); +extern const struct sched_class *__setscheduler_class(int policy, int prio); +extern void set_load_weight(struct task_struct *p, bool update_load); +extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_class_changing(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class); +extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio); + +#ifdef CONFIG_SMP +extern struct balance_callback *splice_balance_callbacks(struct rq *rq); +extern void balance_callbacks(struct rq *rq, struct balance_callback *head); +#else + +static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) +{ +} + +#endif + +#ifdef CONFIG_SCHED_CLASS_EXT +/* + * Used by SCX in the enable/disable paths to move tasks between sched_classes + * and establish invariants. + */ +struct sched_enq_and_set_ctx { + struct task_struct *p; + int queue_flags; + bool queued; + bool running; +}; + +void sched_deq_and_put_task(struct task_struct *p, int queue_flags, + struct sched_enq_and_set_ctx *ctx); +void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +#include "ext.h" #endif /* _KERNEL_SCHED_SCHED_H */ |