Diffstat (limited to 'kernel/sched/stats.h')
-rw-r--r--  kernel/sched/stats.h | 320
1 file changed, 249 insertions, 71 deletions
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index d5710651043b..c903f1a42891 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,6 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
 #ifdef CONFIG_SCHEDSTATS
 
+extern struct static_key_false sched_schedstats;
+
 /*
  * Expects runqueue lock to be held for atomicity of update
  */
@@ -24,76 +29,259 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 }
 
 static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+rq_sched_info_dequeue(struct rq *rq, unsigned long long delta)
 {
 	if (rq)
 		rq->rq_sched_info.run_delay += delta;
 }
 
-#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
-#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
-#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
-#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
-#define schedstat_val(var)		(var)
-#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
-
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
+#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var)		do { var++; } while (0)
+#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt)	do { var += (amt); } while (0)
+#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val)	do { var = (val); } while (0)
+#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var)		(var)
+#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
+
+void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
+			       struct sched_statistics *stats);
+
+void __update_stats_wait_end(struct rq *rq, struct task_struct *p,
+			     struct sched_statistics *stats);
+void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
+				    struct sched_statistics *stats);
+
 static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-#define schedstat_enabled()		0
-#define schedstat_inc(var)		do { } while (0)
-#define schedstat_add(var, amt)		do { } while (0)
-#define schedstat_set(var, val)		do { } while (0)
-#define schedstat_val(var)		0
-#define schedstat_val_or_zero(var)	0
+check_schedstat_required(void)
+{
+	if (schedstat_enabled())
+		return;
+
+	/* Force schedstat enabled if a dependent tracepoint is active */
+	if (trace_sched_stat_wait_enabled()    ||
+	    trace_sched_stat_sleep_enabled()   ||
+	    trace_sched_stat_iowait_enabled()  ||
+	    trace_sched_stat_blocked_enabled() ||
+	    trace_sched_stat_runtime_enabled())
+		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n");
+}
+
+#else /* !CONFIG_SCHEDSTATS: */
+
+static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
+# define schedstat_enabled()		0
+# define __schedstat_inc(var)		do { } while (0)
+# define schedstat_inc(var)		do { } while (0)
+# define __schedstat_add(var, amt)	do { } while (0)
+# define schedstat_add(var, amt)	do { } while (0)
+# define __schedstat_set(var, val)	do { } while (0)
+# define schedstat_set(var, val)	do { } while (0)
+# define schedstat_val(var)		0
+# define schedstat_val_or_zero(var)	0
+
+# define __update_stats_wait_start(rq, p, stats)	do { } while (0)
+# define __update_stats_wait_end(rq, p, stats)		do { } while (0)
+# define __update_stats_enqueue_sleeper(rq, p, stats)	do { } while (0)
+# define check_schedstat_required()			do { } while (0)
+
 #endif /* CONFIG_SCHEDSTATS */
 
-#ifdef CONFIG_SCHED_INFO
-static inline void sched_info_reset_dequeued(struct task_struct *t)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+struct sched_entity_stats {
+	struct sched_entity	se;
+	struct sched_statistics	stats;
+} __no_randomize_layout;
+#endif
+
+static inline struct sched_statistics *
+__schedstats_from_se(struct sched_entity *se)
 {
-	t->sched_info.last_queued = 0;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (!entity_is_task(se))
+		return &container_of(se, struct sched_entity_stats, se)->stats;
+#endif
+	return &task_of(se)->stats;
+}
+
+#ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+		     bool sleep);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+				       struct task_struct *prev) {}
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+/*
+ * PSI tracks state that persists across sleeps, such as iowaits and
+ * memory stalls. As a result, it has to distinguish between sleeps,
+ * where a task's runnable state changes, and migrations, where a task
+ * and its runnable state are being moved between CPUs and runqueues.
+ *
+ * A notable case is a task whose dequeue is delayed. PSI considers
+ * those sleeping, but because they are still on the runqueue they can
+ * go through migration requeues. In this case, *sleeping* states need
+ * to be transferred.
+ */
+static inline void psi_enqueue(struct task_struct *p, int flags)
+{
+	int clear = 0, set = 0;
+
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/* Same runqueue, nothing changed for psi */
+	if (flags & ENQUEUE_RESTORE)
+		return;
+
+	/* psi_sched_switch() will handle the flags */
+	if (task_on_cpu(task_rq(p), p))
+		return;
+
+	if (p->se.sched_delayed) {
+		/* CPU migration of "sleeping" task */
+		WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL;
+		if (p->in_iowait)
+			set |= TSK_IOWAIT;
+	} else if (flags & ENQUEUE_MIGRATED) {
+		/* CPU migration of runnable task */
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
+	} else {
+		/* Wakeup of new or sleeping task */
+		if (p->in_iowait)
+			clear |= TSK_IOWAIT;
+		set = TSK_RUNNING;
+		if (p->in_memstall)
+			set |= TSK_MEMSTALL_RUNNING;
+	}
+
+	psi_task_change(p, clear, set);
+}
+
+static inline void psi_dequeue(struct task_struct *p, int flags)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/* Same runqueue, nothing changed for psi */
+	if (flags & DEQUEUE_SAVE)
+		return;
+
+	/*
+	 * A voluntary sleep is a dequeue followed by a task switch. To
+	 * avoid walking all ancestors twice, psi_task_switch() handles
+	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+	 * Do nothing here.
+	 *
+	 * In the SCHED_PROXY_EXECUTION case we may do sleeping
+	 * dequeues that are not followed by a task switch, so check
+	 * TSK_ONCPU is set to ensure the task switch is imminent.
+	 * Otherwise clear the flags as usual.
+	 */
+	if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU))
+		return;
+
+	/*
+	 * When migrating a task to another CPU, clear all psi
+	 * state. The enqueue callback above will work it out.
+	 */
+	psi_task_change(p, p->psi_flags, 0);
 }
 
+static inline void psi_ttwu_dequeue(struct task_struct *p)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+	/*
+	 * Is the task being migrated during a wakeup? Make sure to
+	 * deregister its sleep-persistent psi states from the old
+	 * queue, and let psi_enqueue() know it has to requeue.
+	 */
+	if (unlikely(p->psi_flags)) {
+		struct rq_flags rf;
+		struct rq *rq;
+
+		rq = __task_rq_lock(p, &rf);
+		psi_task_change(p, p->psi_flags, 0);
+		__task_rq_unlock(rq, p, &rf);
+	}
+}
+
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	psi_task_switch(prev, next, sleep);
+}
+
+#else /* !CONFIG_PSI: */
+static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
+static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
+static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+				       struct task_struct *prev) {}
+#endif /* !CONFIG_PSI */
+
+#ifdef CONFIG_SCHED_INFO
+
 /*
  * We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
+ * task was queued to the time that it finally hit a CPU, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across CPUs. The
+ * delta taken on each CPU would annul the skew.
  */
-static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 {
-	unsigned long long now = rq_clock(rq), delta = 0;
+	unsigned long long delta = 0;
 
-	if (unlikely(sched_info_on()))
-		if (t->sched_info.last_queued)
-			delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
-	t->sched_info.run_delay += delta;
+	if (!t->sched_info.last_queued)
+		return;
 
-	rq_sched_info_dequeued(rq, delta);
+	delta = rq_clock(rq) - t->sched_info.last_queued;
+	t->sched_info.last_queued = 0;
+	t->sched_info.run_delay += delta;
+	if (delta > t->sched_info.max_run_delay)
+		t->sched_info.max_run_delay = delta;
+	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+		t->sched_info.min_run_delay = delta;
+
+	rq_sched_info_dequeue(rq, delta);
 }
 
 /*
- * Called when a task finally hits the cpu. We can now calculate how
+ * Called when a task finally hits the CPU. We can now calculate how
  * long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
  */
 static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 {
-	unsigned long long now = rq_clock(rq), delta = 0;
+	unsigned long long now, delta = 0;
 
-	if (t->sched_info.last_queued)
-		delta = now - t->sched_info.last_queued;
-	sched_info_reset_dequeued(t);
+	if (!t->sched_info.last_queued)
+		return;
+
+	now = rq_clock(rq);
+	delta = now - t->sched_info.last_queued;
+	t->sched_info.last_queued = 0;
 	t->sched_info.run_delay += delta;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
+	if (delta > t->sched_info.max_run_delay)
+		t->sched_info.max_run_delay = delta;
+	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+		t->sched_info.min_run_delay = delta;
 
 	rq_sched_info_arrive(rq, delta);
 }
@@ -101,13 +289,12 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 /*
  * This function is only called from enqueue_task(), but also only updates
  * the timestamp if it is already not set. It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
+ * sched_info_dequeue() will clear that stamp when appropriate.
  */
-static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
+static inline void sched_info_enqueue(struct rq *rq, struct task_struct *t)
 {
-	if (unlikely(sched_info_on()))
-		if (!t->sched_info.last_queued)
-			t->sched_info.last_queued = rq_clock(rq);
+	if (!t->sched_info.last_queued)
+		t->sched_info.last_queued = rq_clock(rq);
 }
 
 /*
@@ -115,18 +302,17 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
  * due, typically, to expiring its time slice (this may also be called when
  * switching to the idle task). Now we can calculate how long we ran.
  * Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
+ * sched_info_enqueue() to mark that it has now again started waiting on
  * the runqueue.
  */
 static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
 {
-	unsigned long long delta = rq_clock(rq) -
-					t->sched_info.last_arrival;
+	unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
 
 	rq_sched_info_depart(rq, delta);
 
-	if (t->state == TASK_RUNNING)
-		sched_info_queued(rq, t);
+	if (task_is_running(t))
+		sched_info_enqueue(rq, t);
 }
 
 /*
@@ -135,11 +321,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
  * the idle task.) We are only called when prev != next.
  */
 static inline void
-__sched_info_switch(struct rq *rq,
-		    struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
 	/*
-	 * prev now departs the cpu. It's not interesting to record
+	 * prev now departs the CPU. It's not interesting to record
 	 * stats about how efficient we were at scheduling the idle
 	 * process, however.
 	 */
@@ -149,18 +334,11 @@ __sched_info_switch(struct rq *rq,
 	if (next != rq->idle)
 		sched_info_arrive(rq, next);
 }
 
-static inline void
-sched_info_switch(struct rq *rq,
-		  struct task_struct *prev, struct task_struct *next)
-{
-	if (unlikely(sched_info_on()))
-		__sched_info_switch(rq, prev, next);
-}
-#else
-#define sched_info_queued(rq, t)	do { } while (0)
-#define sched_info_reset_dequeued(t)	do { } while (0)
-#define sched_info_dequeued(rq, t)	do { } while (0)
-#define sched_info_depart(rq, t)	do { } while (0)
-#define sched_info_arrive(rq, next)	do { } while (0)
-#define sched_info_switch(rq, t, next)	do { } while (0)
-#endif /* CONFIG_SCHED_INFO */
+
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_enqueue(rq, t)	do { } while (0)
+# define sched_info_dequeue(rq, t)	do { } while (0)
+# define sched_info_switch(rq, t, next)	do { } while (0)
+#endif /* !CONFIG_SCHED_INFO */
+
+#endif /* _KERNEL_STATS_H */
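
Editor's note on the guarded/unguarded macro split introduced above: schedstat_inc(), schedstat_add() and schedstat_set() test the sched_schedstats static branch on every access, while the __schedstat_*() variants do not, so a caller can pay for one schedstat_enabled() check and then update several counters cheaply. The sketch below illustrates that calling convention. It is modeled loosely on the wait-time accounting behind the __update_stats_wait_end() declaration above; the name example_account_wait() and the exact body are illustrative assumptions, not the upstream code, although wait_start, wait_max, wait_count and wait_sum are real struct sched_statistics fields.

static inline void example_account_wait(struct rq *rq, struct sched_statistics *stats)
{
	u64 delta;

	/* One static-branch test covers the whole group of updates. */
	if (!schedstat_enabled())
		return;

	delta = rq_clock(rq) - schedstat_val(stats->wait_start);

	/* The __schedstat_*() forms skip the redundant per-access checks. */
	__schedstat_set(stats->wait_max, max(schedstat_val(stats->wait_max), delta));
	__schedstat_inc(stats->wait_count);
	__schedstat_add(stats->wait_sum, delta);
	__schedstat_set(stats->wait_start, 0);
}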

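For orientation, the psi_enqueue()/psi_dequeue() and sched_info_enqueue()/sched_info_dequeue() hooks defined in this header are driven from the scheduler core's enqueue and dequeue paths. The fragment below is a simplified, hypothetical rendering of those call sites; the real ones live in kernel/sched/core.c and carry additional logic, and the example_* names are placeholders.

static void example_enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	/* Start (or continue) accounting this task's wait on the runqueue. */
	sched_info_enqueue(rq, p);

	/* psi_enqueue() itself filters ENQUEUE_RESTORE and distinguishes
	 * wakeups from migrations via the flags (see its body above). */
	psi_enqueue(p, flags);

	/* The scheduling class ->enqueue_task() callback also runs in this path. */
}

static void example_dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	/* Fold the run_delay accumulated so far into the task's sched_info. */
	sched_info_dequeue(rq, p);

	/* psi_dequeue() filters DEQUEUE_SAVE and defers voluntary-sleep
	 * handling to psi_task_switch(), as documented above. */
	psi_dequeue(p, flags);

	/* The scheduling class ->dequeue_task() callback also runs in this path. */
}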