diff options
Diffstat (limited to 'include/linux/sched.h')
| -rw-r--r-- | include/linux/sched.h | 1693 |
1 files changed, 1104 insertions, 589 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index d2f90fa92468..d395f2810fac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -10,86 +10,131 @@ #include <uapi/linux/sched.h> #include <asm/current.h> - -#include <linux/pid.h> -#include <linux/sem.h> +#include <asm/processor.h> +#include <linux/thread_info.h> +#include <linux/preempt.h> +#include <linux/cpumask_types.h> + +#include <linux/cache.h> +#include <linux/irqflags_types.h> +#include <linux/smp_types.h> +#include <linux/pid_types.h> +#include <linux/sem_types.h> #include <linux/shm.h> -#include <linux/kcov.h> -#include <linux/mutex.h> -#include <linux/plist.h> -#include <linux/hrtimer.h> -#include <linux/seccomp.h> -#include <linux/nodemask.h> -#include <linux/rcupdate.h> +#include <linux/kmsan_types.h> +#include <linux/mutex_types.h> +#include <linux/plist_types.h> +#include <linux/hrtimer_types.h> +#include <linux/timer_types.h> +#include <linux/seccomp_types.h> +#include <linux/nodemask_types.h> +#include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> +#include <linux/sched/types.h> #include <linux/signal_types.h> -#include <linux/psi_types.h> +#include <linux/spinlock.h> +#include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> +#include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> -#include <linux/rseq.h> +#include <linux/posix-timers_types.h> +#include <linux/restart_block.h> +#include <linux/rseq_types.h> +#include <linux/seqlock_types.h> +#include <linux/kcsan.h> +#include <linux/rv.h> +#include <linux/uidgid_types.h> +#include <linux/tracepoint-defs.h> +#include <linux/unwind_deferred_types.h> +#include <asm/kmap_size.h> +#ifndef COMPILE_OFFSETS +#include <generated/rq-offsets.h> +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; -struct backing_dev_info; struct bio_list; struct blk_plug; +struct bpf_local_storage; 
+struct bpf_run_ctx; +struct bpf_net_context; +struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; +struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; +struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; +struct root_domain; +struct rq; struct sched_attr; -struct sched_param; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; +struct user_event_mm; + +#include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * - * We have two separate sets of flags: task->state + * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. 
*/ -/* Used in tsk->state: */ -#define TASK_RUNNING 0x0000 -#define TASK_INTERRUPTIBLE 0x0001 -#define TASK_UNINTERRUPTIBLE 0x0002 -#define __TASK_STOPPED 0x0004 -#define __TASK_TRACED 0x0008 +/* Used in tsk->__state: */ +#define TASK_RUNNING 0x00000000 +#define TASK_INTERRUPTIBLE 0x00000001 +#define TASK_UNINTERRUPTIBLE 0x00000002 +#define __TASK_STOPPED 0x00000004 +#define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ -#define EXIT_DEAD 0x0010 -#define EXIT_ZOMBIE 0x0020 +#define EXIT_DEAD 0x00000010 +#define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) -/* Used in tsk->state again: */ -#define TASK_PARKED 0x0040 -#define TASK_DEAD 0x0080 -#define TASK_WAKEKILL 0x0100 -#define TASK_WAKING 0x0200 -#define TASK_NOLOAD 0x0400 -#define TASK_NEW 0x0800 -#define TASK_STATE_MAX 0x1000 +/* Used in tsk->__state again: */ +#define TASK_PARKED 0x00000040 +#define TASK_DEAD 0x00000080 +#define TASK_WAKEKILL 0x00000100 +#define TASK_WAKING 0x00000200 +#define TASK_NOLOAD 0x00000400 +#define TASK_NEW 0x00000800 +#define TASK_RTLOCK_WAIT 0x00001000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 + +#define TASK_ANY (TASK_STATE_MAX-1) + +/* + * DO NOT ADD ANY NEW USERS ! 
+ */ +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) @@ -102,76 +147,84 @@ struct task_group; __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) - -#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) - -#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) +#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0 && \ - (task->state & TASK_NOLOAD) == 0) - -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) +#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). 
*/ -#define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) - -#define __set_current_state(state_value) \ - do { \ - WARN_ON_ONCE(is_special_task_state(state_value));\ - current->task_state_change = _THIS_IP_; \ - current->state = (state_value); \ - } while (0) +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ + TASK_DEAD | TASK_FROZEN)) -#define set_current_state(state_value) \ - do { \ - WARN_ON_ONCE(is_special_task_state(state_value));\ - current->task_state_change = _THIS_IP_; \ - smp_store_mb(current->state, (state_value)); \ +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define debug_normal_state_change(state_value) \ + do { \ + WARN_ON_ONCE(is_special_task_state(state_value)); \ + current->task_state_change = _THIS_IP_; \ } while (0) -#define set_special_state(state_value) \ +# define debug_special_state_change(state_value) \ do { \ - unsigned long flags; /* may shadow */ \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ - raw_spin_lock_irqsave(&current->pi_lock, flags); \ current->task_state_change = _THIS_IP_; \ - current->state = (state_value); \ - raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) + +# define debug_rtlock_wait_set_state() \ + do { \ + current->saved_state_change = current->task_state_change;\ + current->task_state_change = _THIS_IP_; \ + } while (0) + +# define debug_rtlock_wait_restore_state() \ + do { \ + current->task_state_change = current->saved_state_change;\ + } while (0) + #else +# define debug_normal_state_change(cond) do { } while (0) +# define debug_special_state_change(cond) do { } while (0) +# define debug_rtlock_wait_set_state() do { } while (0) +# define debug_rtlock_wait_restore_state() do { } while (0) +#endif + +#define trace_set_current_state(state_value) \ + do { \ + if (tracepoint_enabled(sched_set_state_tp)) \ + __trace_set_current_state(state_value); \ + } while (0) + /* - * set_current_state() includes a 
barrier so that the write of current->state + * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (!need_sleep) - * break; + * if (CONDITION) + * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the - * condition test and condition change and wakeup are under the same lock) then + * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * - * need_sleep = false; + * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * - * where wake_up_state() executes a full memory barrier before accessing the - * task state. + * where wake_up_state()/try_to_wake_up() executes a full memory barrier before + * accessing p->__state. * - * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * @@ -183,31 +236,94 @@ struct task_group; * Also see the comments of try_to_wake_up(). 
*/ #define __set_current_state(state_value) \ - current->state = (state_value) + do { \ + debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ + WRITE_ONCE(current->__state, (state_value)); \ + } while (0) #define set_current_state(state_value) \ - smp_store_mb(current->state, (state_value)) + do { \ + debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ + smp_store_mb(current->__state, (state_value)); \ + } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must - * serialize against wakeups such that any possible in-flight TASK_RUNNING stores - * will not collide with our state change. + * serialize against wakeups such that any possible in-flight TASK_RUNNING + * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ + \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ - current->state = (state_value); \ + debug_special_state_change((state_value)); \ + trace_set_current_state(state_value); \ + WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) -#endif +/* + * PREEMPT_RT specific variants for "sleeping" spin/rwlocks + * + * RT's spin/rwlock substitutions are state preserving. The state of the + * task when blocking on the lock is saved in task_struct::saved_state and + * restored after the lock has been acquired. These operations are + * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT + * lock related wakeups while the task is blocked on the lock are + * redirected to operate on task_struct::saved_state to ensure that these + * are not dropped. On restore task_struct::saved_state is set to + * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. 
+ * + * The lock operation looks like this: + * + * current_save_and_set_rtlock_wait_state(); + * for (;;) { + * if (try_lock()) + * break; + * raw_spin_unlock_irq(&lock->wait_lock); + * schedule_rtlock(); + * raw_spin_lock_irq(&lock->wait_lock); + * set_current_state(TASK_RTLOCK_WAIT); + * } + * current_restore_rtlock_saved_state(); + */ +#define current_save_and_set_rtlock_wait_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(&current->pi_lock); \ + current->saved_state = current->__state; \ + debug_rtlock_wait_set_state(); \ + trace_set_current_state(TASK_RTLOCK_WAIT); \ + WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ + raw_spin_unlock(&current->pi_lock); \ + } while (0); + +#define current_restore_rtlock_saved_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(&current->pi_lock); \ + debug_rtlock_wait_restore_state(); \ + trace_set_current_state(current->saved_state); \ + WRITE_ONCE(current->__state, current->saved_state); \ + current->saved_state = TASK_RUNNING; \ + raw_spin_unlock(&current->pi_lock); \ + } while (0); -/* Task command name length: */ -#define TASK_COMM_LEN 16 +#define get_current_state() READ_ONCE(current->__state) -extern void scheduler_tick(void); +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. 
+ */ +enum { + TASK_COMM_LEN = 16, +}; + +extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -218,12 +334,22 @@ extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +asmlinkage void preempt_schedule_irq(void); +#ifdef CONFIG_PREEMPT_RT + extern void schedule_rtlock(void); +#endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +/* wrapper functions to trace from this header file */ +DECLARE_TRACEPOINT(sched_set_state_tp); +extern void __trace_set_current_state(int state_value); +DECLARE_TRACEPOINT(sched_set_need_resched_tp); +extern void __trace_set_need_resched(struct task_struct *curr, int tif); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode @@ -241,45 +367,50 @@ struct prev_cputime { #endif }; -/** - * struct task_cputime - collected CPU time counts - * @utime: time spent in user mode, in nanoseconds - * @stime: time spent in kernel mode, in nanoseconds - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * - * This structure groups together three kinds of CPU time that are tracked for - * threads and thread groups. Most things considering CPU time want to group - * these counts together and treat all three of them in parallel. 
- */ -struct task_cputime { - u64 utime; - u64 stime; - unsigned long long sum_exec_runtime; -}; - -/* Alternate field names when used on cache expirations: */ -#define virt_exp utime -#define prof_exp stime -#define sched_exp sum_exec_runtime - enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, - /* Task runs in userspace in a CPU with VTIME active: */ - VTIME_USER, + /* Task is idle */ + VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, + /* Task runs in userspace in a CPU with VTIME active: */ + VTIME_USER, + /* Task runs as guests in a CPU with VTIME active: */ + VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; + unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; +/* + * Utilization clamp constraints. + * @UCLAMP_MIN: Minimum utilization + * @UCLAMP_MAX: Maximum utilization + * @UCLAMP_CNT: Utilization clamp constraints count + */ +enum uclamp_id { + UCLAMP_MIN = 0, + UCLAMP_MAX, + UCLAMP_CNT +}; + +extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); + +struct sched_param { + int sched_priority; +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ @@ -290,6 +421,12 @@ struct sched_info { /* Time spent waiting on a runqueue: */ unsigned long long run_delay; + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; + + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? 
*/ @@ -311,76 +448,45 @@ struct sched_info { # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) +/* Increase resolution of cpu_capacity calculations */ +# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT +# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) + struct load_weight { unsigned long weight; u32 inv_weight; }; -/** - * struct util_est - Estimation utilization of FAIR tasks - * @enqueued: instantaneous estimated utilization of a task/cpu - * @ewma: the Exponential Weighted Moving Average (EWMA) - * utilization of a task - * - * Support data structure to track an Exponential Weighted Moving Average - * (EWMA) of a FAIR task's utilization. New samples are added to the moving - * average each time a task completes an activation. Sample's weight is chosen - * so that the EWMA will be relatively insensitive to transient changes to the - * task's workload. - * - * The enqueued attribute has a slightly different meaning for tasks and cpus: - * - task: the task's util_avg at last task dequeue time - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU - * Thus, the util_est.enqueued of a task represents the contribution on the - * estimated utilization of the CPU where that task is currently enqueued. - * - * Only for tasks we track a moving average of the past instantaneous - * estimated utilization. This allows to absorb sporadic drops in utilization - * of an otherwise almost periodic task. - */ -struct util_est { - unsigned int enqueued; - unsigned int ewma; -#define UTIL_EST_WEIGHT_SHIFT 2 -} __attribute__((__aligned__(sizeof(u64)))); - /* - * The load_avg/util_avg accumulates an infinite geometric series - * (see __update_load_avg() in kernel/sched/fair.c). + * The load/runnable/util_avg accumulates an infinite geometric series + * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). 
* * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * - * where runnable% is the time ratio that a sched_entity is runnable. - * For cfs_rq, it is the aggregated load_avg of all runnable and - * blocked sched_entities. + * [runnable_avg definition] * - * load_avg may also take frequency scaling into account: - * - * load_avg = runnable% * scale_load_down(load) * freq% - * - * where freq% is the CPU frequency normalized to the highest frequency. + * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * - * where running% is the time ratio that a sched_entity is running on - * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable - * and blocked sched_entities. + * where runnable% is the time ratio that a sched_entity is runnable and + * running% the time ratio that a sched_entity is running. * - * util_avg may also factor frequency scaling and CPU capacity scaling: + * For cfs_rq, they are the aggregated values of all runnable and blocked + * sched_entities. * - * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% + * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU + * capacity scaling. The scaling is done through the rq_clock_pelt that is used + * for computing those signals (see update_rq_clock_pelt()) * - * where freq% is the same as above, and capacity% is the CPU capacity - * normalized to the greatest capacity (due to uarch differences, etc). - * - * N.B., the above ratios (runnable%, running%, freq%, and capacity%) - * themselves are in the range of [0, 1]. To do fixed point arithmetics, - * we therefore scale them to as large a range as necessary. This is for - * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * N.B., the above ratios (runnable% and running%) themselves are in the + * range of [0, 1]. To do fixed point arithmetics, we therefore scale them + * to as large a range as necessary. 
This is for example reflected by + * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * @@ -399,15 +505,26 @@ struct util_est { struct sched_avg { u64 last_update_time; u64 load_sum; - u64 runnable_load_sum; + u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; - unsigned long runnable_load_avg; + unsigned long runnable_avg; unsigned long util_avg; - struct util_est util_est; + unsigned int util_est; } ____cacheline_aligned; +/* + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. + */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; @@ -423,7 +540,9 @@ struct sched_statistics { u64 block_start; u64 block_max; - u64 exec_max; + s64 sum_block_runtime; + + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; @@ -441,26 +560,45 @@ struct sched_statistics { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; + +#ifdef CONFIG_SCHED_CORE + u64 core_forceidle_sum; #endif -}; +#endif /* CONFIG_SCHEDSTATS */ +} ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; - unsigned long runnable_weight; struct rb_node run_node; + u64 deadline; + u64 min_vruntime; + u64 min_slice; + struct list_head group_node; - unsigned int on_rq; + unsigned char on_rq; + unsigned char sched_delayed; + unsigned char rel_deadline; + unsigned char custom_slice; + /* hole */ u64 exec_start; u64 sum_exec_runtime; - u64 vruntime; u64 prev_sum_exec_runtime; + u64 vruntime; + union { + /* + * When !@on_rq this field is vlag. 
+ * When cfs_rq->curr == se (which implies @on_rq) + * this field is vprot. See protect_slice(). + */ + s64 vlag; + u64 vprot; + }; + u64 slice; u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -468,9 +606,10 @@ struct sched_entity { struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; #endif -#ifdef CONFIG_SMP /* * Per entity load average tracking. * @@ -478,7 +617,6 @@ struct sched_entity { * collide with read-mostly values above. */ struct sched_avg avg; -#endif }; struct sched_rt_entity { @@ -499,6 +637,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); + struct sched_dl_entity { struct rb_node rb_node; @@ -529,10 +670,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_boosted tells if we are boosted due to DI. If so we are - * outside bandwidth enforcement mechanism (but only until we - * exit the critical section); - * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * @@ -545,12 +682,36 @@ struct sched_dl_entity { * * @dl_overrun tells if the task asked to be informed about runtime * overruns. + * + * @dl_server tells if this is a server entity. + * + * @dl_server_active tells if the dlserver is active(started). + * dlserver is started on first cfs enqueue on an idle runqueue + * and is stopped when a dequeue results in 0 cfs tasks on the + * runqueue. In other words, dlserver is active only when cpu's + * runqueue has atleast one cfs task. + * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. 
+ * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * + * @dl_defer_running tells if the deferrable server is actually + * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; - unsigned int dl_boosted : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; + unsigned int dl_server_active : 1; + unsigned int dl_defer : 1; + unsigned int dl_defer_armed : 1; + unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -565,15 +726,68 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + */ + struct rq *rq; + dl_server_pick_f server_pick_task; + +#ifdef CONFIG_RT_MUTEXES + /* + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself). + */ + struct sched_dl_entity *pi_se; +#endif }; +#ifdef CONFIG_UCLAMP_TASK +/* Number of utilization clamp buckets (shorter alias) */ +#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT + +/* + * Utilization clamp for a scheduling entity + * @value: clamp value "assigned" to a se + * @bucket_id: bucket index corresponding to the "assigned" value + * @active: the se is currently refcounted in a rq's bucket + * @user_defined: the requested clamp value comes from user-space + * + * The bucket_id is the index of the clamp bucket matching the clamp value + * which is pre-computed and stored to avoid expensive integer divisions from + * the fast path. 
+ * + * The active bit is set whenever a task has got an "effective" value assigned, + * which can be different from the clamp value "requested" from user-space. + * This allows to know a task is refcounted in the rq's bucket corresponding + * to the "effective" bucket_id. + * + * The user_defined bit is set whenever a task has got a task-specific clamp + * value requested from userspace, i.e. the system defaults apply to this task + * just as a restriction. This allows to relax default clamps when a less + * restrictive task-specific value has been requested, thus allowing to + * implement a "nice" semantic. For example, a task running with a 20% + * default boost can still drop its own boosting to 0%. + */ +struct uclamp_se { + unsigned int value : bits_per(SCHED_CAPACITY_SCALE); + unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); + unsigned int active : 1; + unsigned int user_defined : 1; +}; +#endif /* CONFIG_UCLAMP_TASK */ + union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ - u8 pad; /* No garbage from compiler! */ + u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; @@ -585,10 +799,23 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + struct wake_q_node { struct wake_q_node *next; }; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; +#endif +}; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -597,8 +824,10 @@ struct task_struct { */ struct thread_info thread_info; #endif - /* -1 unrunnable, 0 runnable, >0 stopped: */ - volatile long state; + unsigned int __state; + + /* saved state for "spinlock sleepers" */ + unsigned int saved_state; /* * This begins the randomizable portion of task_struct. 
Only @@ -607,18 +836,17 @@ struct task_struct { randomized_struct_fields_start void *stack; - atomic_t usage; + refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; -#ifdef CONFIG_SMP - struct llist_node wake_entry; - int on_cpu; -#ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; #endif + + int on_cpu; + struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -632,7 +860,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; -#endif int on_rq; int prio; @@ -640,13 +867,45 @@ struct task_struct { int normal_prio; unsigned int rt_priority; - const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; + struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif + const struct sched_class *sched_class; + +#ifdef CONFIG_SCHED_CORE + struct rb_node core_node; + unsigned long core_cookie; + unsigned int core_occupation; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif - struct sched_dl_entity dl; + + +#ifdef CONFIG_UCLAMP_TASK + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. 
+ */ + struct uclamp_se uclamp[UCLAMP_CNT]; +#endif + + struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ @@ -658,8 +917,14 @@ struct task_struct { #endif unsigned int policy; + unsigned long max_allowed_capacity; int nr_cpus_allowed; - cpumask_t cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t *user_cpus_ptr; + cpumask_t cpus_mask; + void *migration_pending; + unsigned short migration_disabled; + unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; @@ -674,25 +939,29 @@ struct task_struct { u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + union rcu_special trc_reader_special; + struct list_head trc_holdout_list; + struct list_head trc_blkd_node; + int trc_blkd_cpu; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + struct sched_info sched_info; struct list_head tasks; -#ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; -#endif struct mm_struct *mm; struct mm_struct *active_mm; + struct address_space *faults_disabled_mapping; - /* Per-thread vma caching: */ - struct vmacache vmacache; - -#ifdef SPLIT_RSS_COUNTING - struct task_rss_stat rss_stat; -#endif int exit_state; int exit_code; int exit_signal; @@ -708,43 +977,82 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - unsigned sched_remote_wakeup:1; -#ifdef CONFIG_PSI - unsigned sched_psi_wake_requeue:1; -#endif + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ - /* Bit to tell LSMs we're in execve(): */ + /* + * This field must not be in the scheduler word above due to wakelist + * queueing no longer being serialized by p->on_cpu. 
However: + * + * p->XXX = X; ttwu() + * schedule() if (p->on_rq && ..) // false + * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true + * deactivate_task() ttwu_queue_wakelist()) + * p->on_rq = 0; p->sched_remote_wakeup = Y; + * + * guarantees all stores of 'current' are visible before + * ->sched_remote_wakeup gets used, so it can be in this word. + */ + unsigned sched_remote_wakeup:1; +#ifdef CONFIG_RT_MUTEXES + unsigned sched_rt_mutex:1; +#endif + + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; + /* task is frozen/stopped (used by the cgroup freezer) */ + unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP - /* to be used once the psi infrastructure lands upstream. */ unsigned use_memdelay:1; #endif - - /* - * May usercopy functions fault on kernel addresses? - * This is not just a single bit because this can potentially nest. - */ - unsigned int kernel_uaccess_faults_ok; - +#ifdef CONFIG_PSI + /* Stalled due to lack of memory */ + unsigned in_memstall:1; +#endif +#ifdef CONFIG_PAGE_OWNER + /* Used by page_owner=on to detect recursion in page tracking. 
*/ + unsigned in_page_owner:1; +#endif +#ifdef CONFIG_EVENTFD + /* Recursion prevention for eventfd_signal() */ + unsigned in_eventfd:1; +#endif +#ifdef CONFIG_ARCH_HAS_CPU_PASID + unsigned pasid_activated:1; +#endif +#ifdef CONFIG_X86_BUS_LOCK_DETECT + unsigned reported_split_lock:1; +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + /* delay due to memory thrashing */ + unsigned in_thrashing:1; +#endif + unsigned in_nf_duplicate:1; +#ifdef CONFIG_PREEMPT_RT + struct netdev_xmit net_xmit; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; @@ -787,7 +1095,6 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; - struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; @@ -798,6 +1105,9 @@ struct task_struct { /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; + /* PF_KTHREAD | PF_IO_WORKER */ + void *worker_private; + u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME @@ -821,15 +1131,17 @@ struct task_struct { u64 start_time; /* Boot based time in nsecs: */ - u64 real_start_time; + u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; -#ifdef CONFIG_POSIX_TIMERS - struct task_cputime cputime_expires; - struct list_head cpu_timers[3]; + /* Empty if CONFIG_POSIX_CPUTIMERS=n */ + struct posix_cputimers posix_cputimers; + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK + struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ @@ -843,12 +1155,20 @@ struct task_struct { /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; +#ifdef CONFIG_KEYS + /* Cached requested key. */ + struct key *cached_requested_key; +#endif + /* * executable name, excluding path. 
* - * - normally initialized setup_new_exec() - * - access it with [gs]et_task_comm() - * - lock it with task_lock() + * - normally initialized by begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. */ char comm[TASK_COMM_LEN]; @@ -868,12 +1188,16 @@ struct task_struct { /* Open file information: */ struct files_struct *files; +#ifdef CONFIG_IO_URING + struct io_uring_task *io_uring; +#endif + /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; - struct sighand_struct *sighand; + struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ @@ -885,16 +1209,19 @@ struct task_struct { struct callback_head *task_works; - struct audit_context *audit_context; +#ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL + struct audit_context *audit_context; +#endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ - u32 parent_exec_id; - u32 self_exec_id; + u64 parent_exec_id; + u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; @@ -913,25 +1240,30 @@ struct task_struct { struct rt_mutex_waiter *pi_blocked_on; #endif -#ifdef CONFIG_DEBUG_MUTEXES - /* Mutex deadlock detection: */ - struct mutex_waiter *blocked_on; + struct mutex *blocked_on; /* lock we're blocked on */ + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + /* + * Encoded lock address causing task block (lower 2 bits = type from + * <linux/hung_task.h>). Accessed via hung_task_*() helpers. 
+ */ + unsigned long blocker; +#endif + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS - unsigned int irq_events; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; - int hardirqs_enabled; - int hardirq_context; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; + struct irqtrace_events irqtrace; + unsigned int hardirq_threaded; + u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; + int irq_config; +#endif +#ifdef CONFIG_PREEMPT_RT + int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP @@ -942,7 +1274,7 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif -#ifdef CONFIG_UBSAN +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif @@ -952,18 +1284,17 @@ struct task_struct { /* Stacked block device info: */ struct bio_list *bio_list; -#ifdef CONFIG_BLOCK /* Stack plugging: */ struct blk_plug *plug; -#endif /* VM state: */ struct reclaim_state *reclaim_state; - struct backing_dev_info *backing_dev_info; - struct io_context *io_context; +#ifdef CONFIG_COMPACTION + struct capture_control *capture_control; +#endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; @@ -984,18 +1315,20 @@ struct task_struct { #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; - /* Seqence number to catch updates: */ - seqcount_t mems_allowed_seq; + /* Sequence number to catch updates: */ + seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; - int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; -#endif -#ifdef CONFIG_X86_RESCTRL +#ifdef 
CONFIG_PREEMPT_RT + struct llist_node cg_dead_lnode; +#endif /* CONFIG_PREEMPT_RT */ +#endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif @@ -1006,11 +1339,15 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; + struct mutex futex_exit_mutex; + unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS - struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; + u8 perf_recursion[PERF_NR_CONTEXTS]; + struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; @@ -1019,6 +1356,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING @@ -1033,7 +1371,15 @@ struct task_struct { u64 last_sum_exec_runtime; struct callback_head numa_work; - struct numa_group *numa_group; + /* + * This pointer is only modified for current in syscall and + * pagefault context (and for tasks being destroyed), so it can be read + * from any of the following contexts: + * - RCU read-side critical section + * - current->numa_group from everywhere + * - task's runqueue locked, task not running + */ + struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: @@ -1063,21 +1409,11 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; - /* - * RmW on rseq_event_mask must be performed atomically - * with respect to preemption. 
- */ - unsigned long rseq_event_mask; -#endif + struct rseq_data rseq; + struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; - struct rcu_head rcu; - /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; @@ -1111,20 +1447,39 @@ struct task_struct { u64 timer_slack_ns; u64 default_timer_slack_ns; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif +#ifdef CONFIG_KCSAN + struct kcsan_ctx kcsan_ctx; +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events kcsan_save_irqtrace; +#endif +#ifdef CONFIG_KCSAN_WEAK_MEMORY + int kcsan_stack_depth; +#endif +#endif + +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + +#if IS_ENABLED(CONFIG_KUNIT) + struct kunit *kunit_test; +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ - struct ftrace_ret_stack *ret_stack; + unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; + unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced @@ -1137,14 +1492,13 @@ struct task_struct { #endif #ifdef CONFIG_TRACING - /* State flags for use by tracers: */ - unsigned long trace; - /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV + /* See kernel/kcov.c for more details. 
*/ + /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; @@ -1156,22 +1510,34 @@ struct task_struct { /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; + + /* KCOV common handle for remote coverage collection: */ + u64 kcov_handle; + + /* KCOV sequence number: */ + int kcov_sequence; + + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; - gfp_t memcg_oom_gfp_mask; - int memcg_oom_order; +#endif +#ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; + + /* Cache for current->cgroups->memcg->objcg lookups: */ + struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP - struct request_queue *throttle_queue; + struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES @@ -1181,19 +1547,26 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif + struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; +# ifdef CONFIG_PREEMPT_RT + unsigned long saved_state_change; +# endif #endif + struct rcu_head rcu; + refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ - atomic_t stack_refcount; + refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; @@ -1202,158 +1575,124 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage __rcu *bpf_storage; + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; 
+#endif + /* Used by BPF for per-TASK xdp storage */ + struct bpf_net_context *bpf_net_context; -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; +#endif +#ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif - /* - * New fields for task_struct should be added above here, so that - * they are included in the randomized portion of task_struct. - */ - randomized_struct_fields_end +#ifdef CONFIG_X86_MCE + void __user *mce_vaddr; + __u64 mce_kflags; + u64 mce_addr; + __u64 mce_ripv : 1, + mce_whole_page : 1, + __mce_reserved : 62; + struct callback_head mce_kill_me; + int mce_count; +#endif - /* CPU-specific state of this task: */ - struct thread_struct thread; +#ifdef CONFIG_KRETPROBES + struct llist_head kretprobe_instances; +#endif +#ifdef CONFIG_RETHOOK + struct llist_head rethooks; +#endif +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* - * WARNING: on x86, 'thread_struct' contains a variable-sized - * structure. It *MUST* be at the end of 'task_struct'. - * - * Do not put anything below here! + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores */ -}; - -static inline struct pid *task_pid(struct task_struct *task) -{ - return task->thread_pid; -} - -/* - * the helpers to get the task's different pids as they are seen - * from various namespaces - * - * task_xid_nr() : global id, i.e. the id seen from the init namespace; - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of - * current. 
- * task_xid_nr_ns() : id seen from the ns specified; - * - * see also pid_nr() etc in include/linux/pid.h - */ -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); - -static inline pid_t task_pid_nr(struct task_struct *tsk) -{ - return tsk->pid; -} - -static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); -} - -static inline pid_t task_pid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); -} - - -static inline pid_t task_tgid_nr(struct task_struct *tsk) -{ - return tsk->tgid; -} - -/** - * pid_alive - check that a task structure is not stale - * @p: Task structure to be checked. - * - * Test if a process is not yet dead (at most zombie state) - * If pid_alive fails, then pointers within the task structure - * can be stale and must not be dereferenced. - * - * Return: 1 if the process is alive. 0 otherwise. - */ -static inline int pid_alive(const struct task_struct *p) -{ - return p->thread_pid != NULL; -} - -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); -} - -static inline pid_t task_pgrp_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); -} - - -static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); -} - -static inline pid_t task_session_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); -} + struct callback_head l1d_flush_kill; +#endif -static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); -} +#ifdef CONFIG_RV + /* + * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. + * If memory becomes a concern, we can think about a dynamic method. 
+ */ + union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; +#endif -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); -} +#ifdef CONFIG_USER_EVENTS + struct user_event_mm *user_event_mm; +#endif -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; +#ifdef CONFIG_UNWIND_USER + struct unwind_task_info unwind_info; +#endif - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); + /* CPU-specific state of this task: */ + struct thread_struct thread; - return pid; -} + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. + */ + randomized_struct_fields_end +} __attribute__ ((aligned (64))); -static inline pid_t task_ppid_nr(const struct task_struct *tsk) +#ifdef CONFIG_SCHED_PROXY_EXEC +DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); +static inline bool sched_proxy_exec(void) { - return task_ppid_nr_ns(tsk, &init_pid_ns); + return static_branch_likely(&__sched_proxy_exec); } - -/* Obsolete, do not use: */ -static inline pid_t task_pgrp_nr(struct task_struct *tsk) +#else +static inline bool sched_proxy_exec(void) { - return task_pgrp_nr_ns(tsk, &init_pid_ns); + return false; } +#endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) -static inline unsigned int task_state_index(struct task_struct *tsk) +static inline unsigned int __task_state_index(unsigned int tsk_state, + unsigned int tsk_exit_state) { - unsigned int tsk_state = READ_ONCE(tsk->state); - unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; + unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_IDLE) + if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; + /* + * We're lying here, but 
rather than expose a completely new task state + * to userspace, we can make this appear as if the task has gone through + * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. + */ + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) + state = TASK_UNINTERRUPTIBLE; + return fls(state); } +static inline unsigned int task_state_index(struct task_struct *tsk) +{ + return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); +} + static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } @@ -1363,54 +1702,44 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } -/** - * is_global_init - check if a task structure is init. Since init - * is free to have sub-threads we need to check tgid. - * @tsk: Task structure to be checked. - * - * Check if a task structure is the first user space task the kernel created. - * - * Return: 1 if the task structure is init. 0 otherwise. 
- */ -static inline int is_global_init(struct task_struct *tsk) -{ - return task_tgid_nr(tsk) == 1; -} - extern struct pid *cad_pid; /* * Per process flags */ +#define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ -#define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */ -#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ +#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF_FROZEN 0x00010000 /* Frozen for system suspend */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ -#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. 
See memalloc_nofs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, + * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ -#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ +#define PF__HOLE__00800000 0x00800000 +#define PF__HOLE__01000000 0x01000000 +#define PF__HOLE__02000000 0x02000000 +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ +#define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* @@ -1441,14 +1770,10 @@ extern struct pid *cad_pid; #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -static inline bool is_percpu_thread(void) +static __always_inline bool is_percpu_thread(void) { -#ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); -#else - return true; -#endif } /* Per-process atomic flags. 
*/ @@ -1459,6 +1784,7 @@ static inline bool is_percpu_thread(void) #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ +#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1487,6 +1813,10 @@ TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) + TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) @@ -1505,25 +1835,26 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags) } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); -#ifdef CONFIG_SMP -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - if (!cpumask_test_cpu(0, new_mask)) - return -EINVAL; - return 0; -} -#endif +extern int task_can_attach(struct task_struct *p); +extern int dl_bw_alloc(int cpu, u64 dl_bw); +extern void dl_bw_free(int cpu, u64 dl_bw); -#ifndef cpu_relax_yield -#define cpu_relax_yield() cpu_relax() -#endif +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern 
void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); + +/** + * set_cpus_allowed_ptr - set CPU affinity mask of a task + * @p: the task + * @new_mask: CPU affinity mask + * + * Return: zero if successful, or a negative error code + */ +extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); +extern void release_user_cpus_ptr(struct task_struct *p); +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); @@ -1546,6 +1877,10 @@ extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern void sched_set_fifo(struct task_struct *p); +extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); +extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); @@ -1556,7 +1891,7 @@ extern struct task_struct *idle_task(int cpu); * * Return: 1 if @p is an idle task. 0 otherwise. 
*/ -static inline bool is_idle_task(const struct task_struct *p) +static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } @@ -1567,9 +1902,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { -#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; -#endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif @@ -1583,11 +1916,8 @@ extern struct thread_info init_thread_info; extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK -static inline struct thread_info *task_thread_info(struct task_struct *task) -{ - return &task->thread_info; -} -#elif !defined(__HAVE_THREAD_FUNCTIONS) +# define task_thread_info(task) (&(task)->thread_info) +#else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif @@ -1614,35 +1944,44 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); -#else -static inline void kick_process(struct task_struct *tsk) { } -#endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} - -extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +/* + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. 
+ * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. + * + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. + */ #define get_task_comm(buf, tsk) ({ \ - BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ - __get_task_comm(buf, sizeof(buf), tsk); \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ }) -#ifdef CONFIG_SMP -void scheduler_ipi(void); -extern unsigned long wait_task_inactive(struct task_struct *, long match_state); -#else -static inline void scheduler_ipi(void) { } -static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) +static __always_inline void scheduler_ipi(void) { - return 1; + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); } -#endif + +extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. 
@@ -1681,12 +2020,16 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) static inline void set_tsk_need_resched(struct task_struct *tsk) { + if (tracepoint_enabled(sched_set_need_resched_tp) && + !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) + __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, + (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) @@ -1694,52 +2037,166 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +static inline void set_need_resched_current(void) +{ + lockdep_assert_irqs_disabled(); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. 
* cond_resched_lock() will drop the spinlock before scheduling, */ -#ifndef CONFIG_PREEMPT -extern int _cond_resched(void); -#else -static inline int _cond_resched(void) { return 0; } -#endif +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +extern int __cond_resched(void); + +#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + +DECLARE_STATIC_CALL(cond_resched, __cond_resched); + +static __always_inline int _cond_resched(void) +{ + return static_call_mod(cond_resched)(); +} + +#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) + +extern int dynamic_cond_resched(void); + +static __always_inline int _cond_resched(void) +{ + return dynamic_cond_resched(); +} + +#else /* !CONFIG_PREEMPTION */ + +static inline int _cond_resched(void) +{ + return __cond_resched(); +} + +#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ + +#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ + +static inline int _cond_resched(void) +{ + return 0; +} + +#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ - ___might_sleep(__FILE__, __LINE__, 0); \ + __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); +extern int __cond_resched_rwlock_read(rwlock_t *lock); +extern int __cond_resched_rwlock_write(rwlock_t *lock); + +#define MIGHT_RESCHED_RCU_SHIFT 8 +#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) -#define cond_resched_lock(lock) ({ \ - ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ - __cond_resched_lock(lock); \ +#ifndef CONFIG_PREEMPT_RT +/* + * Non RT kernels have an elevated preempt count due to the held lock, + * but are not allowed to be inside a RCU read side critical section + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET +#else +/* + * spin/rw_lock() on RT implies rcu_read_lock(). 
The might_sleep() check in + * cond_resched*lock() has to take that into account because it checks for + * preempt_count() and rcu_preempt_depth(). + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS \ + (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) +#endif + +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_lock(lock); \ +}) + +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_read(lock); \ +}) + +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_write(lock); \ }) -static inline void cond_resched_rcu(void) +#ifndef CONFIG_PREEMPT_RT +static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); -#endif + struct mutex *m = p->blocked_on; + + if (m) + lockdep_assert_held_once(&m->wait_lock); + return m; } -/* - * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPT, - * but a general need for low latency) - */ -static inline int spin_needbreak(spinlock_t *lock) +static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + WARN_ON_ONCE(!m); + /* The task should only be setting itself as blocked */ + WARN_ON_ONCE(p != current); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * Check ensure we don't overwrite existing mutex value + * with a different mutex. Note, setting it to the same + * lock repeatedly is ok. 
+ */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + WRITE_ONCE(p->blocked_on, m); +} + +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __set_task_blocked_on(p, m); +} + +static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + if (m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + } + WRITE_ONCE(p->blocked_on, NULL); +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { -#ifdef CONFIG_PREEMPT - return spin_is_contended(lock); + guard(raw_spinlock_irqsave)(&m->wait_lock); + __clear_task_blocked_on(p, m); +} #else - return 0; -#endif +static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ } +#endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { @@ -1753,11 +2210,7 @@ static __always_inline bool need_resched(void) static inline unsigned int task_cpu(const struct task_struct *p) { -#ifdef CONFIG_THREAD_INFO_IN_TASK - return p->cpu; -#else - return task_thread_info(p)->cpu; -#endif + return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); @@ -1775,6 +2228,15 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline bool task_is_runnable(struct task_struct *p) +{ + return p->on_rq && !p->se.sched_delayed; +} + +extern bool sched_task_on_rq(struct task_struct *p); +extern unsigned long get_wchan(struct task_struct *p); 
+extern struct task_struct *cpu_curr_snapshot(int cpu); + /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. @@ -1784,7 +2246,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) * running or not. */ #ifndef vcpu_is_preempted -# define vcpu_is_preempted(cpu) false +static inline bool vcpu_is_preempted(int cpu) +{ + return false; +} #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); @@ -1794,135 +2259,185 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_RSEQ - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) +static inline bool owner_on_cpu(struct task_struct *owner) { - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); +/* Returns effective CPU energy utilization, as seen by the scheduler */ +unsigned long sched_cpu_util(int cpu); + +#ifdef CONFIG_SCHED_CORE +extern void sched_core_free(struct task_struct *tsk); +extern void sched_core_fork(struct task_struct *p); +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, 
+ unsigned long uaddr); +extern int sched_core_idle_cpu(int cpu); +#else +static inline void sched_core_free(struct task_struct *tsk) { } +static inline void sched_core_fork(struct task_struct *p) { } +static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } +#endif -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + +#ifdef CONFIG_MEM_ALLOC_PROFILING +static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); + swap(current->alloc_tag, tag); + return tag; } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) +static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); - rseq_handle_notify_resume(ksig, regs); +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; } +#else +#define alloc_tag_save(_tag) NULL +#define alloc_tag_restore(_tag, _old) do {} while (0) +#endif -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); +static __always_inline int task_mm_cid(struct task_struct *t) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } - -/* rseq_migrate() requires preemption to be disabled. 
*/ -static inline void rseq_migrate(struct task_struct *t) +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } +static __always_inline int task_mm_cid(struct task_struct *t) { - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); } +#endif + +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* - * If parent process has a registered restartable sequences area, the - * child inherits. Only applies when forking a process, not a thread. + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. 
*/ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ - if (clone_flags & CLONE_THREAD) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; - } else { - t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; - } -} +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) -static inline void rseq_execve(struct task_struct *t) +static inline void __migrate_enable(void) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; -} + struct task_struct *p = current; -#else +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. 
+ */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; } -static inline void rseq_execve(struct task_struct *t) + +static inline void __migrate_disable(void) { -} + struct task_struct *p = current; + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif + p->migration_disabled++; + return; + } -void __exit_umh(struct task_struct *tsk); + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ -static inline void exit_umh(struct task_struct *tsk) +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. + */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static __always_inline void migrate_disable(void) { - if (unlikely(tsk->flags & PF_UMH)) - __exit_umh(tsk); + __migrate_disable(); } -#ifdef CONFIG_DEBUG_RSEQ - -void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) +static __always_inline void migrate_enable(void) { + __migrate_enable(); } +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ -#endif +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif |
