diff options
Diffstat (limited to 'include/linux/sched.h')
| -rw-r--r-- | include/linux/sched.h | 3993 |
1 files changed, 1897 insertions, 2096 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d04b92ceda..d395f2810fac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1,1633 +1,1746 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H -#include <uapi/linux/sched.h> - - -struct sched_param { - int sched_priority; -}; +/* + * Define 'struct task_struct' and provide the main scheduler + * APIs (schedule(), wakeup variants, etc.) + */ -#include <asm/param.h> /* for HZ */ +#include <uapi/linux/sched.h> -#include <linux/capability.h> -#include <linux/threads.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/timex.h> -#include <linux/jiffies.h> -#include <linux/rbtree.h> +#include <asm/current.h> +#include <asm/processor.h> #include <linux/thread_info.h> -#include <linux/cpumask.h> -#include <linux/errno.h> -#include <linux/nodemask.h> -#include <linux/mm_types.h> - -#include <asm/page.h> -#include <asm/ptrace.h> -#include <asm/cputime.h> - -#include <linux/smp.h> -#include <linux/sem.h> -#include <linux/signal.h> -#include <linux/compiler.h> -#include <linux/completion.h> -#include <linux/pid.h> -#include <linux/percpu.h> -#include <linux/topology.h> -#include <linux/proportions.h> -#include <linux/seccomp.h> -#include <linux/rcupdate.h> -#include <linux/rculist.h> -#include <linux/rtmutex.h> - -#include <linux/time.h> -#include <linux/param.h> +#include <linux/preempt.h> +#include <linux/cpumask_types.h> + +#include <linux/cache.h> +#include <linux/irqflags_types.h> +#include <linux/smp_types.h> +#include <linux/pid_types.h> +#include <linux/sem_types.h> +#include <linux/shm.h> +#include <linux/kmsan_types.h> +#include <linux/mutex_types.h> +#include <linux/plist_types.h> +#include <linux/hrtimer_types.h> +#include <linux/timer_types.h> +#include <linux/seccomp_types.h> +#include <linux/nodemask_types.h> +#include <linux/refcount_types.h> #include <linux/resource.h> -#include <linux/timer.h> -#include <linux/hrtimer.h> -#include <linux/task_io_accounting.h> #include <linux/latencytop.h> -#include <linux/cred.h> -#include <linux/llist.h> -#include <linux/uidgid.h> -#include <linux/gfp.h> - -#include <asm/processor.h> - -struct exec_domain; -struct futex_pi_state; -struct robust_list_head; +#include <linux/sched/prio.h> +#include <linux/sched/types.h> +#include <linux/signal_types.h> +#include <linux/spinlock.h> +#include <linux/syscall_user_dispatch_types.h> +#include <linux/mm_types_task.h> +#include <linux/netdevice_xmit.h> +#include <linux/task_io_accounting.h> +#include <linux/posix-timers_types.h> +#include <linux/restart_block.h> +#include <linux/rseq_types.h> +#include <linux/seqlock_types.h> +#include <linux/kcsan.h> +#include <linux/rv.h> +#include <linux/uidgid_types.h> +#include <linux/tracepoint-defs.h> +#include <linux/unwind_deferred_types.h> +#include <asm/kmap_size.h> +#ifndef COMPILE_OFFSETS +#include <generated/rq-offsets.h> +#endif + +/* task_struct member predeclarations (sorted alphabetically): */ +struct audit_context; struct bio_list; +struct blk_plug; +struct bpf_local_storage; +struct bpf_run_ctx; +struct bpf_net_context; +struct capture_control; +struct cfs_rq; struct fs_struct; +struct futex_pi_state; +struct io_context; +struct io_uring_task; +struct mempolicy; +struct nameidata; +struct nsproxy; struct perf_event_context; -struct blk_plug; - -/* - * List of flags we want to share for kernel threads, - * if only because they are not used by them anyway. - */ -#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) - -/* - * These are the constant used to fake the fixed-point load-average - * counting. Some notes: - * - 11 bit fractions expand to 22 bits by the multiplies: this gives - * a load-average precision of 10 bits integer + 11 bits fractional - * - if you want to count load-averages more often, you need more - * precision, or rounding will get you. With 2-second counting freq, - * the EXP_n values would be 1981, 2034 and 2043 if still using only - * 11 bit fractions. - */ -extern unsigned long avenrun[]; /* Load averages */ -extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); - -#define FSHIFT 11 /* nr of bits of precision */ -#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ -#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ -#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ -#define EXP_5 2014 /* 1/exp(5sec/5min) */ -#define EXP_15 2037 /* 1/exp(5sec/15min) */ - -#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; - -extern unsigned long total_forks; -extern int nr_threads; -DECLARE_PER_CPU(unsigned long, process_counts); -extern int nr_processes(void); -extern unsigned long nr_running(void); -extern unsigned long nr_iowait(void); -extern unsigned long nr_iowait_cpu(int cpu); -extern unsigned long this_cpu_load(void); - - -extern void calc_global_load(unsigned long ticks); -extern void update_cpu_load_nohz(void); - -/* Notifier for when a task gets migrated to a new CPU */ -struct task_migration_notifier { - struct task_struct *task; - int from_cpu; - int to_cpu; -}; -extern void register_task_migration_notifier(struct notifier_block *n); - -extern unsigned long get_parent_ip(unsigned long addr); - -extern void dump_cpu_task(int cpu); - +struct perf_ctx_data; +struct pid_namespace; +struct pipe_inode_info; +struct rcu_node; +struct reclaim_state; +struct robust_list_head; +struct root_domain; +struct rq; +struct sched_attr; +struct sched_dl_entity; struct seq_file; -struct cfs_rq; +struct sighand_struct; +struct signal_struct; +struct task_delay_info; struct task_group; -#ifdef CONFIG_SCHED_DEBUG -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); -extern void proc_sched_set_task(struct task_struct *p); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); -#endif +struct task_struct; +struct user_event_mm; + +#include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * - * We have two separate sets of flags: task->state + * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define __TASK_STOPPED 4 -#define __TASK_TRACED 8 -/* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 -/* in tsk->state again */ -#define TASK_DEAD 64 -#define TASK_WAKEKILL 128 -#define TASK_WAKING 256 -#define TASK_PARKED 512 -#define TASK_STATE_MAX 1024 - -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" - -extern char ___assert_task_state[1 - 2*!!( - sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; - -/* Convenience macros for the sake of set_task_state */ -#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) -#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) -#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) - -/* Convenience macros for the sake of wake_up */ -#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) - -/* get_task_state() */ -#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ - TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED) - -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) -#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_dead(task) ((task)->exit_state != 0) -#define task_is_stopped_or_traced(task) \ - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) -#define task_contributes_to_load(task) \ - ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) - -#define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -#define set_task_state(tsk, state_value) \ - set_mb((tsk)->state, (state_value)) + +/* Used in tsk->__state: */ +#define TASK_RUNNING 0x00000000 +#define TASK_INTERRUPTIBLE 0x00000001 +#define TASK_UNINTERRUPTIBLE 0x00000002 +#define __TASK_STOPPED 0x00000004 +#define __TASK_TRACED 0x00000008 +/* Used in tsk->exit_state: */ +#define EXIT_DEAD 0x00000010 +#define EXIT_ZOMBIE 0x00000020 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) +/* Used in tsk->__state again: */ +#define TASK_PARKED 0x00000040 +#define TASK_DEAD 0x00000080 +#define TASK_WAKEKILL 0x00000100 +#define TASK_WAKING 0x00000200 +#define TASK_NOLOAD 0x00000400 +#define TASK_NEW 0x00000800 +#define TASK_RTLOCK_WAIT 0x00001000 +#define TASK_FREEZABLE 0x00002000 +#define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) +#define TASK_FROZEN 0x00008000 +#define TASK_STATE_MAX 0x00010000 + +#define TASK_ANY (TASK_STATE_MAX-1) /* - * set_current_state() includes a barrier so that the write of current->state - * is correctly serialised wrt the caller's subsequent test of whether to - * actually sleep: - * - * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); - * - * If the caller does not need such serialisation then use __set_current_state() + * DO NOT ADD ANY NEW USERS ! */ -#define __set_current_state(state_value) \ - do { current->state = (state_value); } while (0) -#define set_current_state(state_value) \ - set_mb(current->state, (state_value)) +#define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) -/* Task command name length */ -#define TASK_COMM_LEN 16 +/* Convenience macros for the sake of set_current_state: */ +#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) +#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) +#define TASK_TRACED __TASK_TRACED -#include <linux/spinlock.h> +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) -/* - * This serializes "schedule()" and also protects - * the run-queue from deletions/modifications (but - * _adding_ to the beginning of the run-queue has - * a separate lock). - */ -extern rwlock_t tasklist_lock; -extern spinlock_t mmlist_lock; +/* Convenience macros for the sake of wake_up(): */ +#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) -struct task_struct; +/* get_task_state(): */ +#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ + TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) -#ifdef CONFIG_PROVE_RCU -extern int lockdep_tasklist_lock_is_held(void); -#endif /* #ifdef CONFIG_PROVE_RCU */ +#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) -extern void sched_init(void); -extern void sched_init_smp(void); -extern asmlinkage void schedule_tail(struct task_struct *prev); -extern void init_idle(struct task_struct *idle, int cpu); -extern void init_idle_bootup_task(struct task_struct *idle); +#define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) +#define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) +#define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) -extern int runqueue_is_locked(int cpu); +/* + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). + */ +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ + TASK_DEAD | TASK_FROZEN)) + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define debug_normal_state_change(state_value) \ + do { \ + WARN_ON_ONCE(is_special_task_state(state_value)); \ + current->task_state_change = _THIS_IP_; \ + } while (0) + +# define debug_special_state_change(state_value) \ + do { \ + WARN_ON_ONCE(!is_special_task_state(state_value)); \ + current->task_state_change = _THIS_IP_; \ + } while (0) + +# define debug_rtlock_wait_set_state() \ + do { \ + current->saved_state_change = current->task_state_change;\ + current->task_state_change = _THIS_IP_; \ + } while (0) + +# define debug_rtlock_wait_restore_state() \ + do { \ + current->task_state_change = current->saved_state_change;\ + } while (0) -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void nohz_balance_enter_idle(int cpu); -extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(void); #else -static inline void nohz_balance_enter_idle(int cpu) { } -static inline void set_cpu_sd_state_idle(void) { } +# define debug_normal_state_change(cond) do { } while (0) +# define debug_special_state_change(cond) do { } while (0) +# define debug_rtlock_wait_set_state() do { } while (0) +# define debug_rtlock_wait_restore_state() do { } while (0) #endif +#define trace_set_current_state(state_value) \ + do { \ + if (tracepoint_enabled(sched_set_state_tp)) \ + __trace_set_current_state(state_value); \ + } while (0) + /* - * Only dump TASK_* tasks. (0 for all tasks) + * set_current_state() includes a barrier so that the write of current->__state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * for (;;) { + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (CONDITION) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * If the caller does not need such serialisation (because, for instance, the + * CONDITION test and condition change and wakeup are under the same lock) then + * use __set_current_state(). + * + * The above is typically ordered against the wakeup, which does: + * + * CONDITION = 1; + * wake_up_state(p, TASK_UNINTERRUPTIBLE); + * + * where wake_up_state()/try_to_wake_up() executes a full memory barrier before + * accessing p->__state. + * + * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, + * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a + * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). + * + * However, with slightly different timing the wakeup TASK_RUNNING store can + * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not + * a problem either because that will result in one extra go around the loop + * and our @cond test will save the day. + * + * Also see the comments of try_to_wake_up(). */ -extern void show_state_filter(unsigned long state_filter); - -static inline void show_state(void) -{ - show_state_filter(0); -} +#define __set_current_state(state_value) \ + do { \ + debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ + WRITE_ONCE(current->__state, (state_value)); \ + } while (0) + +#define set_current_state(state_value) \ + do { \ + debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ + smp_store_mb(current->__state, (state_value)); \ + } while (0) -extern void show_regs(struct pt_regs *); +/* + * set_special_state() should be used for those states when the blocking task + * can not use the regular condition based wait-loop. In that case we must + * serialize against wakeups such that any possible in-flight TASK_RUNNING + * stores will not collide with our state change. + */ +#define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ + \ + raw_spin_lock_irqsave(¤t->pi_lock, flags); \ + debug_special_state_change((state_value)); \ + trace_set_current_state(state_value); \ + WRITE_ONCE(current->__state, (state_value)); \ + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ + } while (0) /* - * TASK is a pointer to the task whose backtrace we want to see (or NULL for current - * task), SP is the stack pointer of the first frame that should be shown in the back - * trace (or NULL if the entire call-chain of the task should be shown). + * PREEMPT_RT specific variants for "sleeping" spin/rwlocks + * + * RT's spin/rwlock substitutions are state preserving. The state of the + * task when blocking on the lock is saved in task_struct::saved_state and + * restored after the lock has been acquired. These operations are + * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT + * lock related wakeups while the task is blocked on the lock are + * redirected to operate on task_struct::saved_state to ensure that these + * are not dropped. On restore task_struct::saved_state is set to + * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. + * + * The lock operation looks like this: + * + * current_save_and_set_rtlock_wait_state(); + * for (;;) { + * if (try_lock()) + * break; + * raw_spin_unlock_irq(&lock->wait_lock); + * schedule_rtlock(); + * raw_spin_lock_irq(&lock->wait_lock); + * set_current_state(TASK_RTLOCK_WAIT); + * } + * current_restore_rtlock_saved_state(); */ -extern void show_stack(struct task_struct *task, unsigned long *sp); - -void io_schedule(void); -long io_schedule_timeout(long timeout); - -extern void cpu_init (void); -extern void trap_init(void); -extern void update_process_times(int user); -extern void scheduler_tick(void); - -extern void sched_show_task(struct task_struct *p); - -#ifdef CONFIG_LOCKUP_DETECTOR -extern void touch_softlockup_watchdog(void); -extern void touch_softlockup_watchdog_sync(void); -extern void touch_all_softlockup_watchdogs(void); -extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -extern unsigned int softlockup_panic; -void lockup_detector_init(void); -#else -static inline void touch_softlockup_watchdog(void) -{ -} -static inline void touch_softlockup_watchdog_sync(void) -{ -} -static inline void touch_all_softlockup_watchdogs(void) -{ -} -static inline void lockup_detector_init(void) -{ -} -#endif +#define current_save_and_set_rtlock_wait_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(¤t->pi_lock); \ + current->saved_state = current->__state; \ + debug_rtlock_wait_set_state(); \ + trace_set_current_state(TASK_RTLOCK_WAIT); \ + WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ + raw_spin_unlock(¤t->pi_lock); \ + } while (0); + +#define current_restore_rtlock_saved_state() \ + do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_lock(¤t->pi_lock); \ + debug_rtlock_wait_restore_state(); \ + trace_set_current_state(current->saved_state); \ + WRITE_ONCE(current->__state, current->saved_state); \ + current->saved_state = TASK_RUNNING; \ + raw_spin_unlock(¤t->pi_lock); \ + } while (0); + +#define get_current_state() READ_ONCE(current->__state) -/* Attach to any functions which should be ignored in wchan output. */ -#define __sched __attribute__((__section__(".sched.text"))) +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. + */ +enum { + TASK_COMM_LEN = 16, +}; -/* Linker adds these: start and end of __sched functions */ -extern char __sched_text_start[], __sched_text_end[]; +extern void sched_tick(void); -/* Is this address in the __sched functions? */ -extern int in_sched_functions(unsigned long addr); +#define MAX_SCHEDULE_TIMEOUT LONG_MAX -#define MAX_SCHEDULE_TIMEOUT LONG_MAX -extern signed long schedule_timeout(signed long timeout); -extern signed long schedule_timeout_interruptible(signed long timeout); -extern signed long schedule_timeout_killable(signed long timeout); -extern signed long schedule_timeout_uninterruptible(signed long timeout); +extern long schedule_timeout(long timeout); +extern long schedule_timeout_interruptible(long timeout); +extern long schedule_timeout_killable(long timeout); +extern long schedule_timeout_uninterruptible(long timeout); +extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +asmlinkage void preempt_schedule_irq(void); +#ifdef CONFIG_PREEMPT_RT + extern void schedule_rtlock(void); +#endif -struct nsproxy; -struct user_namespace; +extern int __must_check io_schedule_prepare(void); +extern void io_schedule_finish(int token); +extern long io_schedule_timeout(long timeout); +extern void io_schedule(void); -#ifdef CONFIG_MMU -extern void arch_pick_mmap_layout(struct mm_struct *mm); -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long -arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); -#else -static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} +/* wrapper functions to trace from this header file */ +DECLARE_TRACEPOINT(sched_set_state_tp); +extern void __trace_set_current_state(int state_value); +DECLARE_TRACEPOINT(sched_set_need_resched_tp); +extern void __trace_set_need_resched(struct task_struct *curr, int tif); + +/** + * struct prev_cputime - snapshot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * @lock: protects the above two fields + * + * Stores previous user/system time values such that we can guarantee + * monotonicity. + */ +struct prev_cputime { +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + u64 utime; + u64 stime; + raw_spinlock_t lock; #endif +}; +enum vtime_state { + /* Task is sleeping or running in a CPU with VTIME inactive: */ + VTIME_INACTIVE = 0, + /* Task is idle */ + VTIME_IDLE, + /* Task runs in kernelspace in a CPU with VTIME active: */ + VTIME_SYS, + /* Task runs in userspace in a CPU with VTIME active: */ + VTIME_USER, + /* Task runs as guests in a CPU with VTIME active: */ + VTIME_GUEST, +}; -extern void set_dumpable(struct mm_struct *mm, int value); -extern int get_dumpable(struct mm_struct *mm); +struct vtime { + seqcount_t seqcount; + unsigned long long starttime; + enum vtime_state state; + unsigned int cpu; + u64 utime; + u64 stime; + u64 gtime; +}; -/* mm flags */ -/* dumpable bits */ -#define MMF_DUMPABLE 0 /* core dump is permitted */ -#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ +/* + * Utilization clamp constraints. + * @UCLAMP_MIN: Minimum utilization + * @UCLAMP_MAX: Maximum utilization + * @UCLAMP_CNT: Utilization clamp constraints count + */ +enum uclamp_id { + UCLAMP_MIN = 0, + UCLAMP_MAX, + UCLAMP_CNT +}; -#define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); -/* coredump filter bits */ -#define MMF_DUMP_ANON_PRIVATE 2 -#define MMF_DUMP_ANON_SHARED 3 -#define MMF_DUMP_MAPPED_PRIVATE 4 -#define MMF_DUMP_MAPPED_SHARED 5 -#define MMF_DUMP_ELF_HEADERS 6 -#define MMF_DUMP_HUGETLB_PRIVATE 7 -#define MMF_DUMP_HUGETLB_SHARED 8 +struct sched_param { + int sched_priority; +}; -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 7 -#define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) -#define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) +struct sched_info { +#ifdef CONFIG_SCHED_INFO + /* Cumulative counters: */ -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) -#else -# define MMF_DUMP_MASK_DEFAULT_ELF 0 -#endif - /* leave room for more dump flags */ -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ + /* # of times we have run on this CPU: */ + unsigned long pcount; -#define MMF_HAS_UPROBES 19 /* has uprobes */ -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ + /* Time spent waiting on a runqueue: */ + unsigned long long run_delay; -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; -struct sighand_struct { - atomic_t count; - struct k_sigaction action[_NSIG]; - spinlock_t siglock; - wait_queue_head_t signalfd_wqh; -}; + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; -struct pacct_struct { - int ac_flag; - long ac_exitcode; - unsigned long ac_mem; - cputime_t ac_utime, ac_stime; - unsigned long ac_minflt, ac_majflt; -}; + /* Timestamps: */ -struct cpu_itimer { - cputime_t expires; - cputime_t incr; - u32 error; - u32 incr_error; -}; + /* When did we last run on a CPU? */ + unsigned long long last_arrival; -/** - * struct cputime - snaphsot of system and user cputime - * @utime: time spent in user mode - * @stime: time spent in system mode - * - * Gathers a generic snapshot of user and system time. - */ -struct cputime { - cputime_t utime; - cputime_t stime; + /* When were we last queued to run? */ + unsigned long long last_queued; + +#endif /* CONFIG_SCHED_INFO */ }; -/** - * struct task_cputime - collected CPU time counts - * @utime: time spent in user mode, in &cputime_t units - * @stime: time spent in kernel mode, in &cputime_t units - * @sum_exec_runtime: total time spent on the CPU, in nanoseconds - * - * This is an extension of struct cputime that includes the total runtime - * spent by the task from the scheduler point of view. +/* + * Integer metrics need fixed point arithmetic, e.g., sched/fair + * has a few: load, load_avg, util_avg, freq, and capacity. * - * As a result, this structure groups together three kinds of CPU time - * that are tracked for threads and thread groups. Most things considering - * CPU time want to group these counts together and treat all three - * of them in parallel. + * We define a basic fixed point arithmetic range, and then formalize + * all these metrics based on that basic range. */ -struct task_cputime { - cputime_t utime; - cputime_t stime; - unsigned long long sum_exec_runtime; +# define SCHED_FIXEDPOINT_SHIFT 10 +# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) + +/* Increase resolution of cpu_capacity calculations */ +# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT +# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) + +struct load_weight { + unsigned long weight; + u32 inv_weight; }; -/* Alternate field names when used to cache expirations. */ -#define prof_exp stime -#define virt_exp utime -#define sched_exp sum_exec_runtime - -#define INIT_CPUTIME \ - (struct task_cputime) { \ - .utime = 0, \ - .stime = 0, \ - .sum_exec_runtime = 0, \ - } /* - * Disable preemption until the scheduler is running. - * Reset by start_kernel()->sched_init()->init_idle(). + * The load/runnable/util_avg accumulates an infinite geometric series + * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * - * We include PREEMPT_ACTIVE to avoid cond_resched() from working - * before the scheduler is active -- see should_resched(). - */ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) - -/** - * struct thread_group_cputimer - thread group interval timer counts - * @cputime: thread group interval timers. - * @running: non-zero when there are timers running and - * @cputime receives updates. - * @lock: lock for fields in this struct. + * [load_avg definition] + * + * load_avg = runnable% * scale_load_down(load) + * + * [runnable_avg definition] + * + * runnable_avg = runnable% * SCHED_CAPACITY_SCALE + * + * [util_avg definition] + * + * util_avg = running% * SCHED_CAPACITY_SCALE + * + * where runnable% is the time ratio that a sched_entity is runnable and + * running% the time ratio that a sched_entity is running. + * + * For cfs_rq, they are the aggregated values of all runnable and blocked + * sched_entities. + * + * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU + * capacity scaling. The scaling is done through the rq_clock_pelt that is used + * for computing those signals (see update_rq_clock_pelt()) + * + * N.B., the above ratios (runnable% and running%) themselves are in the + * range of [0, 1]. To do fixed point arithmetics, we therefore scale them + * to as large a range as necessary. This is for example reflected by + * util_avg's SCHED_CAPACITY_SCALE. + * + * [Overflow issue] + * + * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities + * with the highest load (=88761), always runnable on a single cfs_rq, + * and should not overflow as the number already hits PID_MAX_LIMIT. * - * This structure contains the version of task_cputime, above, that is - * used for thread group CPU timer calculations. + * For all other cases (including 32-bit kernels), struct load_weight's + * weight will overflow first before we do, because: + * + * Max(load_avg) <= Max(load.weight) + * + * Then it is the load_weight's responsibility to consider overflow + * issues. */ -struct thread_group_cputimer { - struct task_cputime cputime; - int running; - raw_spinlock_t lock; -}; - -#include <linux/rwsem.h> -struct autogroup; +struct sched_avg { + u64 last_update_time; + u64 load_sum; + u64 runnable_sum; + u32 util_sum; + u32 period_contrib; + unsigned long load_avg; + unsigned long runnable_avg; + unsigned long util_avg; + unsigned int util_est; +} ____cacheline_aligned; /* - * NOTE! "signal_struct" does not have its own - * locking, because a shared signal_struct always - * implies a shared sighand_struct, so locking - * sighand_struct is always a proper superset of - * the locking of signal_struct. + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. */ -struct signal_struct { - atomic_t sigcnt; - atomic_t live; - int nr_threads; - - wait_queue_head_t wait_chldexit; /* for wait4() */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 - /* current thread group signal load-balancing target: */ - struct task_struct *curr_target; +struct sched_statistics { +#ifdef CONFIG_SCHEDSTATS + u64 wait_start; + u64 wait_max; + u64 wait_count; + u64 wait_sum; + u64 iowait_count; + u64 iowait_sum; + + u64 sleep_start; + u64 sleep_max; + s64 sum_sleep_runtime; + + u64 block_start; + u64 block_max; + s64 sum_block_runtime; + + s64 exec_max; + u64 slice_max; + + u64 nr_migrations_cold; + u64 nr_failed_migrations_affine; + u64 nr_failed_migrations_running; + u64 nr_failed_migrations_hot; + u64 nr_forced_migrations; + + u64 nr_wakeups; + u64 nr_wakeups_sync; + u64 nr_wakeups_migrate; + u64 nr_wakeups_local; + u64 nr_wakeups_remote; + u64 nr_wakeups_affine; + u64 nr_wakeups_affine_attempts; + u64 nr_wakeups_passive; + u64 nr_wakeups_idle; + +#ifdef CONFIG_SCHED_CORE + u64 core_forceidle_sum; +#endif +#endif /* CONFIG_SCHEDSTATS */ +} ____cacheline_aligned; - /* shared signal handling: */ - struct sigpending shared_pending; +struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; + u64 deadline; + u64 min_vruntime; + u64 min_slice; + + struct list_head group_node; + unsigned char on_rq; + unsigned char sched_delayed; + unsigned char rel_deadline; + unsigned char custom_slice; + /* hole */ + + u64 exec_start; + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; + union { + /* + * When !@on_rq this field is vlag. + * When cfs_rq->curr == se (which implies @on_rq) + * this field is vprot. See protect_slice(). + */ + s64 vlag; + u64 vprot; + }; + u64 slice; - /* thread group exit support */ - int group_exit_code; - /* overloaded: - * - notify group_exit_task when ->count is equal to notify_count - * - everyone except group_exit_task is stopped during signal delivery - * of fatal signals, group_exit_task processes the signal. - */ - int notify_count; - struct task_struct *group_exit_task; + u64 nr_migrations; - /* thread group stop support, overloads group_exit_code too */ - int group_stop_count; - unsigned int flags; /* see SIGNAL_* flags below */ +#ifdef CONFIG_FAIR_GROUP_SCHED + int depth; + struct sched_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct cfs_rq *cfs_rq; + /* rq "owned" by this entity/group: */ + struct cfs_rq *my_q; + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; +#endif /* - * PR_SET_CHILD_SUBREAPER marks a process, like a service - * manager, to re-parent orphan (double-forking) child processes - * to this process instead of 'init'. The service manager is - * able to receive SIGCHLD signals and is able to investigate - * the process until it calls wait(). All children of this - * process will inherit a flag if they should look for a - * child_subreaper process at exit. + * Per entity load average tracking. + * + * Put into separate cache line so it does not + * collide with read-mostly values above. */ - unsigned int is_child_subreaper:1; - unsigned int has_child_subreaper:1; + struct sched_avg avg; +}; + +struct sched_rt_entity { + struct list_head run_list; + unsigned long timeout; + unsigned long watchdog_stamp; + unsigned int time_slice; + unsigned short on_rq; + unsigned short on_list; + + struct sched_rt_entity *back; +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *parent; + /* rq on which this entity is (to be) queued: */ + struct rt_rq *rt_rq; + /* rq "owned" by this entity/group: */ + struct rt_rq *my_q; +#endif +} __randomize_layout; - /* POSIX.1b Interval Timers */ - int posix_timer_id; - struct list_head posix_timers; +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); - /* ITIMER_REAL timer for the process */ - struct hrtimer real_timer; - struct pid *leader_pid; - ktime_t it_real_incr; +struct sched_dl_entity { + struct rb_node rb_node; /* - * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use - * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these - * values are defined to 0 and 1 respectively + * Original scheduling parameters. Copied here from sched_attr + * during sched_setattr(), they will remain the same until + * the next sched_setattr(). */ - struct cpu_itimer it[2]; + u64 dl_runtime; /* Maximum runtime for each instance */ + u64 dl_deadline; /* Relative deadline of each instance */ + u64 dl_period; /* Separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_period */ + u64 dl_density; /* dl_runtime / dl_deadline */ /* - * Thread group totals for process CPU timers. - * See thread_group_cputimer(), et al, for details. + * Actual scheduling parameters. Initialized with the values above, + * they are continuously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. */ - struct thread_group_cputimer cputimer; - - /* Earliest-expiration cache. */ - struct task_cputime cputime_expires; - - struct list_head cpu_timers[3]; - - struct pid *tty_old_pgrp; - - /* boolean value for session group leader */ - int leader; + s64 runtime; /* Remaining runtime for this instance */ + u64 deadline; /* Absolute deadline for this instance */ + unsigned int flags; /* Specifying the scheduler behaviour */ - struct tty_struct *tty; /* NULL if no tty */ + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_yielded tells if task gave up the CPU before consuming + * all its available runtime during the last job. + * + * @dl_non_contending tells if the task is inactive while still + * contributing to the active utilization. In other words, it + * indicates if the inactive timer has been armed and its handler + * has not been executed yet. This flag is useful to avoid race + * conditions between the inactive timer handler and the wakeup + * code. + * + * @dl_overrun tells if the task asked to be informed about runtime + * overruns. + * + * @dl_server tells if this is a server entity. + * + * @dl_server_active tells if the dlserver is active(started). + * dlserver is started on first cfs enqueue on an idle runqueue + * and is stopped when a dequeue results in 0 cfs tasks on the + * runqueue. In other words, dlserver is active only when cpu's + * runqueue has atleast one cfs task. + * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * + * @dl_defer_running tells if the deferrable server is actually + * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state + */ + unsigned int dl_throttled : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; + unsigned int dl_overrun : 1; + unsigned int dl_server : 1; + unsigned int dl_server_active : 1; + unsigned int dl_defer : 1; + unsigned int dl_defer_armed : 1; + unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; -#ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; -#endif /* - * Cumulative resource counters for dead threads in the group, - * and for reaped dead child processes forked by this group. - * Live threads maintain their own counters and add to these - * in __exit_signal, except for the group leader. + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. */ - cputime_t utime, stime, cutime, cstime; - cputime_t gtime; - cputime_t cgtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - struct cputime prev_cputime; -#endif - unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; - unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; - unsigned long inblock, oublock, cinblock, coublock; - unsigned long maxrss, cmaxrss; - struct task_io_accounting ioac; + struct hrtimer dl_timer; /* - * Cumulative ns of schedule CPU time fo dead threads in the - * group, not including a zombie group leader, (This only differs - * from jiffies_to_ns(utime + stime) if sched_clock uses something - * other than jiffies.) + * Inactive timer, responsible for decreasing the active utilization + * at the "0-lag time". When a -deadline task blocks, it contributes + * to GRUB's active utilization until the "0-lag time", hence a + * timer is needed to decrease the active utilization at the correct + * time. */ - unsigned long long sum_sched_runtime; + struct hrtimer inactive_timer; /* - * We don't bother to synchronize most readers of this at all, - * because there is no reader checking a limit that actually needs - * to get both rlim_cur and rlim_max atomically, and either one - * alone is a single word that can safely be read normally. - * getrlimit/setrlimit use task_lock(current->group_leader) to - * protect this instead of the siglock, because they really - * have no need to disable irqs. + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for */ - struct rlimit rlim[RLIM_NLIMITS]; + struct rq *rq; + dl_server_pick_f server_pick_task; -#ifdef CONFIG_BSD_PROCESS_ACCT - struct pacct_struct pacct; /* per-process accounting information */ -#endif -#ifdef CONFIG_TASKSTATS - struct taskstats *stats; -#endif -#ifdef CONFIG_AUDIT - unsigned audit_tty; - unsigned audit_tty_log_passwd; - struct tty_audit_buf *tty_audit_buf; -#endif -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_RT_MUTEXES /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself). */ - struct rw_semaphore group_rwsem; + struct sched_dl_entity *pi_se; #endif - - oom_flags_t oom_flags; - short oom_score_adj; /* OOM kill score adjustment */ - short oom_score_adj_min; /* OOM kill score adjustment min value. - * Only settable by CAP_SYS_RESOURCE. */ - - struct mutex cred_guard_mutex; /* guard against foreign influences on - * credential calculations - * (notably. ptrace) */ }; -/* - * Bits in flags field of signal_struct. - */ -#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ -#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ -#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ -/* - * Pending notifications to parent. - */ -#define SIGNAL_CLD_STOPPED 0x00000010 -#define SIGNAL_CLD_CONTINUED 0x00000020 -#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) - -#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ - -/* If true, all threads except ->group_exit_task have pending SIGKILL */ -static inline int signal_group_exit(const struct signal_struct *sig) -{ - return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exit_task != NULL); -} +#ifdef CONFIG_UCLAMP_TASK +/* Number of utilization clamp buckets (shorter alias) */ +#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* - * Some day this will be a full-fledged user tracking system.. + * Utilization clamp for a scheduling entity + * @value: clamp value "assigned" to a se + * @bucket_id: bucket index corresponding to the "assigned" value + * @active: the se is currently refcounted in a rq's bucket + * @user_defined: the requested clamp value comes from user-space + * + * The bucket_id is the index of the clamp bucket matching the clamp value + * which is pre-computed and stored to avoid expensive integer divisions from + * the fast path. + * + * The active bit is set whenever a task has got an "effective" value assigned, + * which can be different from the clamp value "requested" from user-space. + * This allows to know a task is refcounted in the rq's bucket corresponding + * to the "effective" bucket_id. + * + * The user_defined bit is set whenever a task has got a task-specific clamp + * value requested from userspace, i.e. the system defaults apply to this task + * just as a restriction. This allows to relax default clamps when a less + * restrictive task-specific value has been requested, thus allowing to + * implement a "nice" semantic. For example, a task running with a 20% + * default boost can still drop its own boosting to 0%. */ -struct user_struct { - atomic_t __count; /* reference count */ - atomic_t processes; /* How many processes does this user have? */ - atomic_t files; /* How many open files does this user have? */ - atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_INOTIFY_USER - atomic_t inotify_watches; /* How many inotify watches does this user have? */ - atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ -#endif -#ifdef CONFIG_FANOTIFY - atomic_t fanotify_listeners; -#endif -#ifdef CONFIG_EPOLL - atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ -#endif -#ifdef CONFIG_POSIX_MQUEUE - /* protected by mq_lock */ - unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ -#endif - unsigned long locked_shm; /* How many pages of mlocked shm ? */ - -#ifdef CONFIG_KEYS - struct key *uid_keyring; /* UID specific keyring */ - struct key *session_keyring; /* UID's default session keyring */ -#endif - - /* Hash table maintenance information */ - struct hlist_node uidhash_node; - kuid_t uid; - -#ifdef CONFIG_PERF_EVENTS - atomic_long_t locked_vm; -#endif -}; - -extern int uids_sysfs_init(void); - -extern struct user_struct *find_user(kuid_t); - -extern struct user_struct root_user; -#define INIT_USER (&root_user) - - -struct backing_dev_info; -struct reclaim_state; - -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) -struct sched_info { - /* cumulative counters */ - unsigned long pcount; /* # of times run on this cpu */ - unsigned long long run_delay; /* time spent waiting on a runqueue */ - - /* timestamps */ - unsigned long long last_arrival,/* when we last ran on a cpu */ - last_queued; /* when we were last queued to run */ +struct uclamp_se { + unsigned int value : bits_per(SCHED_CAPACITY_SCALE); + unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); + unsigned int active : 1; + unsigned int user_defined : 1; }; -#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ - -#ifdef CONFIG_TASK_DELAY_ACCT -struct task_delay_info { - spinlock_t lock; - unsigned int flags; /* Private per-task flags */ - - /* For each stat XXX, add following, aligned appropriately - * - * struct timespec XXX_start, XXX_end; - * u64 XXX_delay; - * u32 XXX_count; - * - * Atomicity of updates to XXX_delay, XXX_count protected by - * single lock above (split into XXX_lock if contention is an issue). - */ - - /* - * XXX_count is incremented on every XXX operation, the delay - * associated with the operation is added to XXX_delay. - * XXX_delay contains the accumulated delay time in nanoseconds. - */ - struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ - u64 blkio_delay; /* wait for sync block io completion */ - u64 swapin_delay; /* wait for swapin block io completion */ - u32 blkio_count; /* total count of the number of sync block */ - /* io operations performed */ - u32 swapin_count; /* total count of the number of swapin block */ - /* io operations performed */ - - struct timespec freepages_start, freepages_end; - u64 freepages_delay; /* wait for memory reclaim */ - u32 freepages_count; /* total count of memory reclaim */ +#endif /* CONFIG_UCLAMP_TASK */ + +union rcu_special { + struct { + u8 blocked; + u8 need_qs; + u8 exp_hint; /* Hint for performance. */ + u8 need_mb; /* Readers need smp_mb(). */ + } b; /* Bits. */ + u32 s; /* Set of bits. */ }; -#endif /* CONFIG_TASK_DELAY_ACCT */ -static inline int sched_info_on(void) -{ -#ifdef CONFIG_SCHEDSTATS - return 1; -#elif defined(CONFIG_TASK_DELAY_ACCT) - extern int delayacct_on; - return delayacct_on; -#else - return 0; -#endif -} - -enum cpu_idle_type { - CPU_IDLE, - CPU_NOT_IDLE, - CPU_NEWLY_IDLE, - CPU_MAX_IDLE_TYPES +enum perf_event_task_context { + perf_invalid_context = -1, + perf_hw_context = 0, + perf_sw_context, + perf_nr_task_contexts, }; /* - * Increase resolution of cpu_power calculations + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. */ -#define SCHED_POWER_SHIFT 10 -#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +#define PERF_NR_CONTEXTS 4 -/* - * sched-domains (multiprocessor balancing) declarations: - */ -#ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ -#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ -#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ -#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ -#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ - -extern int __weak arch_sd_sibiling_asym_packing(void); - -struct sched_domain_attr { - int relax_domain_level; +struct wake_q_node { + struct wake_q_node *next; }; -#define SD_ATTR_INIT (struct sched_domain_attr) { \ - .relax_domain_level = -1, \ -} - -extern int sched_domain_level_max; - -struct sched_group; - -struct sched_domain { - /* These fields must be setup */ - struct sched_domain *parent; /* top domain must be null terminated */ - struct sched_domain *child; /* bottom domain must be null terminated */ - struct sched_group *groups; /* the balancing groups of the domain */ - unsigned long min_interval; /* Minimum balance interval ms */ - unsigned long max_interval; /* Maximum balance interval ms */ - unsigned int busy_factor; /* less balancing by factor if busy */ - unsigned int imbalance_pct; /* No balance until over watermark */ - unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int busy_idx; - unsigned int idle_idx; - unsigned int newidle_idx; - unsigned int wake_idx; - unsigned int forkexec_idx; - unsigned int smt_gain; - - int nohz_idle; /* NOHZ IDLE status */ - int flags; /* See SD_* */ - int level; - - /* Runtime fields. */ - unsigned long last_balance; /* init to jiffies. units in jiffies */ - unsigned int balance_interval; /* initialise to 1. units in ms. */ - unsigned int nr_balance_failed; /* initialise to 0 */ - - u64 last_update; - -#ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ - unsigned int lb_count[CPU_MAX_IDLE_TYPES]; - unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; - unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; - unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; - unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; - unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; - unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; - - /* Active load balancing */ - unsigned int alb_count; - unsigned int alb_failed; - unsigned int alb_pushed; - - /* SD_BALANCE_EXEC stats */ - unsigned int sbe_count; - unsigned int sbe_balanced; - unsigned int sbe_pushed; - - /* SD_BALANCE_FORK stats */ - unsigned int sbf_count; - unsigned int sbf_balanced; - unsigned int sbf_pushed; - - /* try_to_wake_up() stats */ - unsigned int ttwu_wake_remote; - unsigned int ttwu_move_affine; - unsigned int ttwu_move_balance; -#endif -#ifdef CONFIG_SCHED_DEBUG - char *name; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; #endif - union { - void *private; /* used during construction */ - struct rcu_head rcu; /* used during destruction */ - }; +}; - unsigned int span_weight; +struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK /* - * Span of all CPUs in this domain. - * - * NOTE: this field is variable length. (Allocated dynamically - * by attaching extra space to the end of the structure, - * depending on how many CPUs the kernel has booted up with) + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. */ - unsigned long span[0]; -}; - -static inline struct cpumask *sched_domain_span(struct sched_domain *sd) -{ - return to_cpumask(sd->span); -} - -extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new); - -/* Allocate an array of sched domains, for partition_sched_domains(). */ -cpumask_var_t *alloc_sched_domains(unsigned int ndoms); -void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); - -bool cpus_share_cache(int this_cpu, int that_cpu); - -#else /* CONFIG_SMP */ - -struct sched_domain_attr; - -static inline void -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - -#endif /* !CONFIG_SMP */ - - -struct io_context; /* See blkdev.h */ - - -#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK -extern void prefetch_stack(struct task_struct *t); -#else -static inline void prefetch_stack(struct task_struct *t) { } + struct thread_info thread_info; #endif + unsigned int __state; -struct audit_context; /* See audit.c */ -struct mempolicy; -struct pipe_inode_info; -struct uts_namespace; - -struct load_weight { - unsigned long weight, inv_weight; -}; + /* saved state for "spinlock sleepers" */ + unsigned int saved_state; -struct sched_avg { /* - * These sums represent an infinite geometric series and so are bound - * above by 1024/(1-y). Thus we only need a u32 to store them for all - * choices of y < 1-2^(-32)*1024. + * This begins the randomizable portion of task_struct. Only + * scheduling-critical items should be added above here. */ - u32 runnable_avg_sum, runnable_avg_period; - u64 last_runnable_update; - s64 decay_count; - unsigned long load_avg_contrib; -}; + randomized_struct_fields_start -#ifdef CONFIG_SCHEDSTATS -struct sched_statistics { - u64 wait_start; - u64 wait_max; - u64 wait_count; - u64 wait_sum; - u64 iowait_count; - u64 iowait_sum; - - u64 sleep_start; - u64 sleep_max; - s64 sum_sleep_runtime; - - u64 block_start; - u64 block_max; - u64 exec_max; - u64 slice_max; - - u64 nr_migrations_cold; - u64 nr_failed_migrations_affine; - u64 nr_failed_migrations_running; - u64 nr_failed_migrations_hot; - u64 nr_forced_migrations; - - u64 nr_wakeups; - u64 nr_wakeups_sync; - u64 nr_wakeups_migrate; - u64 nr_wakeups_local; - u64 nr_wakeups_remote; - u64 nr_wakeups_affine; - u64 nr_wakeups_affine_attempts; - u64 nr_wakeups_passive; - u64 nr_wakeups_idle; -}; + void *stack; + refcount_t usage; + /* Per task flags (PF_*), defined further below: */ + unsigned int flags; + unsigned int ptrace; + +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; #endif -struct sched_entity { - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - struct list_head group_node; - unsigned int on_rq; + int on_cpu; + struct __call_single_node wake_entry; + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; - u64 exec_start; - u64 sum_exec_runtime; - u64 vruntime; - u64 prev_sum_exec_runtime; + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can + * push tasks around a CPU where each wakeup moves to the next one. + * Tracking a recently used CPU allows a quick search for a recently + * used CPU that may be idle. + */ + int recent_used_cpu; + int wake_cpu; + int on_rq; - u64 nr_migrations; + int prio; + int static_prio; + int normal_prio; + unsigned int rt_priority; -#ifdef CONFIG_SCHEDSTATS - struct sched_statistics statistics; + struct sched_entity se; + struct sched_rt_entity rt; + struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; #endif + const struct sched_class *sched_class; -#ifdef CONFIG_FAIR_GROUP_SCHED - struct sched_entity *parent; - /* rq on which this entity is (to be) queued: */ - struct cfs_rq *cfs_rq; - /* rq "owned" by this entity/group: */ - struct cfs_rq *my_q; +#ifdef CONFIG_SCHED_CORE + struct rb_node core_node; + unsigned long core_cookie; + unsigned int core_occupation; #endif -#ifdef CONFIG_SMP - /* Per-entity load-tracking */ - struct sched_avg avg; +#ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; #endif -}; - -struct sched_rt_entity { - struct list_head run_list; - unsigned long timeout; - unsigned long watchdog_stamp; - unsigned int time_slice; - - struct sched_rt_entity *back; -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity *parent; - /* rq on which this entity is (to be) queued: */ - struct rt_rq *rt_rq; - /* rq "owned" by this entity/group: */ - struct rt_rq *my_q; #endif -}; -struct rcu_node; - -enum perf_event_task_context { - perf_invalid_context = -1, - perf_hw_context = 0, - perf_sw_context, - perf_nr_task_contexts, -}; - -struct task_struct { - volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - void *stack; - atomic_t usage; - unsigned int flags; /* per process flags, defined below */ - unsigned int ptrace; - -#ifdef CONFIG_SMP - struct llist_node wake_entry; - int on_cpu; +#ifdef CONFIG_UCLAMP_TASK + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ + struct uclamp_se uclamp[UCLAMP_CNT]; #endif - int on_rq; - int prio, static_prio, normal_prio; - unsigned int rt_priority; - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; -#endif + struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS - /* list of struct preempt_notifier: */ - struct hlist_head preempt_notifiers; + /* List of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; #endif - /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time - */ - unsigned char fpu_counter; #ifdef CONFIG_BLK_DEV_IO_TRACE - unsigned int btrace_seq; + unsigned int btrace_seq; #endif - unsigned int policy; - int nr_cpus_allowed; - cpumask_t cpus_allowed; + unsigned int policy; + unsigned long max_allowed_capacity; + int nr_cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t *user_cpus_ptr; + cpumask_t cpus_mask; + void *migration_pending; + unsigned short migration_disabled; + unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU - int rcu_read_lock_nesting; - char rcu_read_unlock_special; - struct list_head rcu_node_entry; + int rcu_read_lock_nesting; + union rcu_special rcu_read_unlock_special; + struct list_head rcu_node_entry; + struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ -#ifdef CONFIG_TREE_PREEMPT_RCU - struct rcu_node *rcu_blocked_node; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */ -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - struct sched_info sched_info; -#endif +#ifdef CONFIG_TASKS_RCU + unsigned long rcu_tasks_nvcsw; + u8 rcu_tasks_holdout; + u8 rcu_tasks_idx; + int rcu_tasks_idle_cpu; + struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; +#endif /* #ifdef CONFIG_TASKS_RCU */ + +#ifdef CONFIG_TASKS_TRACE_RCU + int trc_reader_nesting; + int trc_ipi_to_cpu; + union rcu_special trc_reader_special; + struct list_head trc_holdout_list; + struct list_head trc_blkd_node; + int trc_blkd_cpu; +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ + + struct sched_info sched_info; + + struct list_head tasks; + struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; + + struct mm_struct *mm; + struct mm_struct *active_mm; + struct address_space *faults_disabled_mapping; + + int exit_state; + int exit_code; + int exit_signal; + /* The signal sent when the parent dies: */ + int pdeath_signal; + /* JOBCTL_*, siglock protected: */ + unsigned long jobctl; + + /* Used for emulating ABI behavior of previous Linux versions: */ + unsigned int personality; + + /* Scheduler bits, serialized by scheduler locks: */ + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; + unsigned sched_task_hot:1; + + /* Force alignment to the next boundary: */ + unsigned :0; + + /* Unserialized, strictly 'current' */ - struct list_head tasks; -#ifdef CONFIG_SMP - struct plist_node pushable_tasks; + /* + * This field must not be in the scheduler word above due to wakelist + * queueing no longer being serialized by p->on_cpu. However: + * + * p->XXX = X; ttwu() + * schedule() if (p->on_rq && ..) // false + * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true + * deactivate_task() ttwu_queue_wakelist()) + * p->on_rq = 0; p->sched_remote_wakeup = Y; + * + * guarantees all stores of 'current' are visible before + * ->sched_remote_wakeup gets used, so it can be in this word. + */ + unsigned sched_remote_wakeup:1; +#ifdef CONFIG_RT_MUTEXES + unsigned sched_rt_mutex:1; #endif - struct mm_struct *mm, *active_mm; + /* Bit to tell TOMOYO we're in execve(): */ + unsigned in_execve:1; + unsigned in_iowait:1; +#ifndef TIF_RESTORE_SIGMASK + unsigned restore_sigmask:1; +#endif +#ifdef CONFIG_MEMCG_V1 + unsigned in_user_fault:1; +#endif +#ifdef CONFIG_LRU_GEN + /* whether the LRU algorithm may apply to this access */ + unsigned in_lru_fault:1; +#endif #ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; + unsigned brk_randomized:1; #endif -#if defined(SPLIT_RSS_COUNTING) - struct task_rss_stat rss_stat; +#ifdef CONFIG_CGROUPS + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; + /* task is frozen/stopped (used by the cgroup freezer) */ + unsigned frozen:1; #endif -/* task state */ - int exit_state; - int exit_code, exit_signal; - int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int jobctl; /* JOBCTL_*, siglock protected */ - - /* Used for emulating ABI behavior of previous Linux versions */ - unsigned int personality; - - unsigned did_exec:1; - unsigned in_execve:1; /* Tell the LSMs that the process is doing an - * execve */ - unsigned in_iowait:1; - - /* task may not gain privileges */ - unsigned no_new_privs:1; +#ifdef CONFIG_BLK_CGROUP + unsigned use_memdelay:1; +#endif +#ifdef CONFIG_PSI + /* Stalled due to lack of memory */ + unsigned in_memstall:1; +#endif +#ifdef CONFIG_PAGE_OWNER + /* Used by page_owner=on to detect recursion in page tracking. */ + unsigned in_page_owner:1; +#endif +#ifdef CONFIG_EVENTFD + /* Recursion prevention for eventfd_signal() */ + unsigned in_eventfd:1; +#endif +#ifdef CONFIG_ARCH_HAS_CPU_PASID + unsigned pasid_activated:1; +#endif +#ifdef CONFIG_X86_BUS_LOCK_DETECT + unsigned reported_split_lock:1; +#endif +#ifdef CONFIG_TASK_DELAY_ACCT + /* delay due to memory thrashing */ + unsigned in_thrashing:1; +#endif + unsigned in_nf_duplicate:1; +#ifdef CONFIG_PREEMPT_RT + struct netdev_xmit net_xmit; +#endif + unsigned long atomic_flags; /* Flags requiring atomic access. */ - /* Revert to default priority/policy when forking */ - unsigned sched_reset_on_fork:1; - unsigned sched_contributes_to_load:1; + struct restart_block restart_block; - pid_t pid; - pid_t tgid; + pid_t pid; + pid_t tgid; -#ifdef CONFIG_CC_STACKPROTECTOR - /* Canary value for the -fstack-protector gcc feature */ - unsigned long stack_canary; +#ifdef CONFIG_STACKPROTECTOR + /* Canary value for the -fstack-protector GCC feature: */ + unsigned long stack_canary; #endif /* - * pointers to (original) parent process, youngest child, younger sibling, + * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ - struct task_struct __rcu *real_parent; /* real parent process */ - struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ + + /* Real parent process: */ + struct task_struct __rcu *real_parent; + + /* Recipient of SIGCHLD, wait4() reports: */ + struct task_struct __rcu *parent; + /* - * children/sibling forms the list of my natural children + * Children/sibling form the list of natural children: */ - struct list_head children; /* list of my children */ - struct list_head sibling; /* linkage in my parent's children list */ - struct task_struct *group_leader; /* threadgroup leader */ + struct list_head children; + struct list_head sibling; + struct task_struct *group_leader; /* - * ptraced is the list of tasks this task is using ptrace on. + * 'ptraced' is the list of tasks this task is using ptrace() on. + * * This includes both natural children and PTRACE_ATTACH targets. - * p->ptrace_entry is p's link on the p->parent->ptraced list. + * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ - struct list_head ptraced; - struct list_head ptrace_entry; + struct list_head ptraced; + struct list_head ptrace_entry; /* PID/PID hash table linkage. */ - struct pid_link pids[PIDTYPE_MAX]; - struct list_head thread_group; + struct pid *thread_pid; + struct hlist_node pid_links[PIDTYPE_MAX]; + struct list_head thread_node; - struct completion *vfork_done; /* for vfork() */ - int __user *set_child_tid; /* CLONE_CHILD_SETTID */ - int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + struct completion *vfork_done; - cputime_t utime, stime, utimescaled, stimescaled; - cputime_t gtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - struct cputime prev_cputime; + /* CLONE_CHILD_SETTID: */ + int __user *set_child_tid; + + /* CLONE_CHILD_CLEARTID: */ + int __user *clear_child_tid; + + /* PF_KTHREAD | PF_IO_WORKER */ + void *worker_private; + + u64 utime; + u64 stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; #endif + u64 gtime; + struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_t vtime_seqlock; - unsigned long long vtime_snap; - enum { - VTIME_SLEEPING = 0, - VTIME_USER, - VTIME_SYS, - } vtime_snap_whence; -#endif - unsigned long nvcsw, nivcsw; /* context switch counts */ - struct timespec start_time; /* monotonic time */ - struct timespec real_start_time; /* boot based time */ -/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ - unsigned long min_flt, maj_flt; - - struct task_cputime cputime_expires; - struct list_head cpu_timers[3]; - -/* process credentials */ - const struct cred __rcu *real_cred; /* objective and real subjective task - * credentials (COW) */ - const struct cred __rcu *cred; /* effective (overridable) subjective task - * credentials (COW) */ - char comm[TASK_COMM_LEN]; /* executable name excluding path - - access with [gs]et_task_comm (which lock - it with task_lock()) - - initialized normally by setup_new_exec */ -/* file system info */ - int link_count, total_link_count; + struct vtime vtime; +#endif + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + /* Context switch counts: */ + unsigned long nvcsw; + unsigned long nivcsw; + + /* Monotonic time in nsecs: */ + u64 start_time; + + /* Boot based time in nsecs: */ + u64 start_boottime; + + /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ + unsigned long min_flt; + unsigned long maj_flt; + + /* Empty if CONFIG_POSIX_CPUTIMERS=n */ + struct posix_cputimers posix_cputimers; + +#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK + struct posix_cputimers_work posix_cputimers_work; +#endif + + /* Process credentials: */ + + /* Tracer's credentials at attach: */ + const struct cred __rcu *ptracer_cred; + + /* Objective and real subjective task credentials (COW): */ + const struct cred __rcu *real_cred; + + /* Effective (overridable) subjective task credentials (COW): */ + const struct cred __rcu *cred; + +#ifdef CONFIG_KEYS + /* Cached requested key. */ + struct key *cached_requested_key; +#endif + + /* + * executable name, excluding path. + * + * - normally initialized begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. + */ + char comm[TASK_COMM_LEN]; + + struct nameidata *nameidata; + #ifdef CONFIG_SYSVIPC -/* ipc stuff */ - struct sysv_sem sysvsem; + struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK -/* hung task detection */ - unsigned long last_switch_count; -#endif -/* CPU-specific state of this task */ - struct thread_struct thread; -/* filesystem information */ - struct fs_struct *fs; -/* open file information */ - struct files_struct *files; -/* namespaces */ - struct nsproxy *nsproxy; -/* signal handlers */ - struct signal_struct *signal; - struct sighand_struct *sighand; - - sigset_t blocked, real_blocked; - sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ - struct sigpending pending; - - unsigned long sas_ss_sp; - size_t sas_ss_size; - int (*notifier)(void *priv); - void *notifier_data; - sigset_t *notifier_mask; - struct callback_head *task_works; - - struct audit_context *audit_context; + unsigned long last_switch_count; + unsigned long last_switch_time; +#endif + /* Filesystem information: */ + struct fs_struct *fs; + + /* Open file information: */ + struct files_struct *files; + +#ifdef CONFIG_IO_URING + struct io_uring_task *io_uring; +#endif + + /* Namespaces: */ + struct nsproxy *nsproxy; + + /* Signal handlers: */ + struct signal_struct *signal; + struct sighand_struct __rcu *sighand; + sigset_t blocked; + sigset_t real_blocked; + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; + + struct callback_head *task_works; + +#ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL - kuid_t loginuid; - unsigned int sessionid; + struct audit_context *audit_context; #endif - struct seccomp seccomp; + kuid_t loginuid; + unsigned int sessionid; +#endif + struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; + + /* Thread group tracking: */ + u64 parent_exec_id; + u64 self_exec_id; -/* Thread group tracking */ - u32 parent_exec_id; - u32 self_exec_id; -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, - * mempolicy */ - spinlock_t alloc_lock; + /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ + spinlock_t alloc_lock; /* Protection of the PI data structures: */ - raw_spinlock_t pi_lock; + raw_spinlock_t pi_lock; + + struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES - /* PI waiters blocked on a rt_mutex held by this task */ - struct plist_head pi_waiters; - /* Deadlock detection and priority inheritance handling */ - struct rt_mutex_waiter *pi_blocked_on; + /* PI waiters blocked on a rt_mutex held by this task: */ + struct rb_root_cached pi_waiters; + /* Updated under owner's pi_lock and rq lock */ + struct task_struct *pi_top_task; + /* Deadlock detection and priority inheritance handling: */ + struct rt_mutex_waiter *pi_blocked_on; #endif -#ifdef CONFIG_DEBUG_MUTEXES - /* mutex deadlock detection */ - struct mutex_waiter *blocked_on; + struct mutex *blocked_on; /* lock we're blocked on */ + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + /* + * Encoded lock address causing task block (lower 2 bits = type from + * <linux/hung_task.h>). Accessed via hung_task_*() helpers. + */ + unsigned long blocker; +#endif + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + int non_block_count; #endif + #ifdef CONFIG_TRACE_IRQFLAGS - unsigned int irq_events; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; - int hardirqs_enabled; - int hardirq_context; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; - int softirqs_enabled; - int softirq_context; + struct irqtrace_events irqtrace; + unsigned int hardirq_threaded; + u64 hardirq_chain_key; + int softirqs_enabled; + int softirq_context; + int irq_config; #endif +#ifdef CONFIG_PREEMPT_RT + int softirq_disable_cnt; +#endif + #ifdef CONFIG_LOCKDEP -# define MAX_LOCK_DEPTH 48UL - u64 curr_chain_key; - int lockdep_depth; - unsigned int lockdep_recursion; - struct held_lock held_locks[MAX_LOCK_DEPTH]; - gfp_t lockdep_reclaim_gfp; +# define MAX_LOCK_DEPTH 48UL + u64 curr_chain_key; + int lockdep_depth; + unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; +#endif + +#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) + unsigned int in_ubsan; #endif -/* journalling filesystem info */ - void *journal_info; + /* Journalling filesystem info: */ + void *journal_info; -/* stacked block device info */ - struct bio_list *bio_list; + /* Stacked block device info: */ + struct bio_list *bio_list; -#ifdef CONFIG_BLOCK -/* stack plugging */ - struct blk_plug *plug; -#endif + /* Stack plugging: */ + struct blk_plug *plug; -/* VM state */ - struct reclaim_state *reclaim_state; + /* VM state: */ + struct reclaim_state *reclaim_state; - struct backing_dev_info *backing_dev_info; + struct io_context *io_context; - struct io_context *io_context; +#ifdef CONFIG_COMPACTION + struct capture_control *capture_control; +#endif + /* Ptrace state: */ + unsigned long ptrace_message; + kernel_siginfo_t *last_siginfo; - unsigned long ptrace_message; - siginfo_t *last_siginfo; /* For ptrace use. */ - struct task_io_accounting ioac; -#if defined(CONFIG_TASK_XACCT) - u64 acct_rss_mem1; /* accumulated rss usage */ - u64 acct_vm_mem1; /* accumulated virtual memory usage */ - cputime_t acct_timexpd; /* stime + utime since last update */ + struct task_io_accounting ioac; +#ifdef CONFIG_PSI + /* Pressure stall state */ + unsigned int psi_flags; +#endif +#ifdef CONFIG_TASK_XACCT + /* Accumulated RSS usage: */ + u64 acct_rss_mem1; + /* Accumulated virtual memory usage: */ + u64 acct_vm_mem1; + /* stime + utime since last update: */ + u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS - nodemask_t mems_allowed; /* Protected by alloc_lock */ - seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ - int cpuset_mem_spread_rotor; - int cpuset_slab_spread_rotor; + /* Protected by ->alloc_lock: */ + nodemask_t mems_allowed; + /* Sequence number to catch updates: */ + seqcount_spinlock_t mems_allowed_seq; + int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS - /* Control Group info protected by css_set_lock */ - struct css_set __rcu *cgroups; - /* cg_list protected by css_set_lock and tsk->alloc_lock */ - struct list_head cg_list; + /* Control Group info protected by css_set_lock: */ + struct css_set __rcu *cgroups; + /* cg_list protected by css_set_lock and tsk->alloc_lock: */ + struct list_head cg_list; +#ifdef CONFIG_PREEMPT_RT + struct llist_node cg_dead_lnode; +#endif /* CONFIG_PREEMPT_RT */ +#endif /* CONFIG_CGROUPS */ +#ifdef CONFIG_X86_CPU_RESCTRL + u32 closid; + u32 rmid; #endif #ifdef CONFIG_FUTEX - struct robust_list_head __user *robust_list; + struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif - struct list_head pi_state_list; - struct futex_pi_state *pi_state_cache; + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; + struct mutex futex_exit_mutex; + unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS - struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; - struct mutex perf_event_mutex; - struct list_head perf_event_list; + u8 perf_recursion[PERF_NR_CONTEXTS]; + struct perf_event_context *perf_event_ctxp; + struct mutex perf_event_mutex; + struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; +#endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA - struct mempolicy *mempolicy; /* Protected by alloc_lock */ - short il_next; - short pref_node_fork; + /* Protected by alloc_lock: */ + struct mempolicy *mempolicy; + short il_prev; + u8 il_weight; + short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING - int numa_scan_seq; - int numa_migrate_seq; - unsigned int numa_scan_period; - u64 node_stamp; /* migration stamp */ - struct callback_head numa_work; -#endif /* CONFIG_NUMA_BALANCING */ + int numa_scan_seq; + unsigned int numa_scan_period; + unsigned int numa_scan_period_max; + int numa_preferred_nid; + unsigned long numa_migrate_retry; + /* Migration stamp: */ + u64 node_stamp; + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; + struct callback_head numa_work; - struct rcu_head rcu; + /* + * This pointer is only modified for current in syscall and + * pagefault context (and for tasks being destroyed), so it can be read + * from any of the following contexts: + * - RCU read-side critical section + * - current->numa_group from everywhere + * - task's runqueue locked, task not running + */ + struct numa_group __rcu *numa_group; /* - * cache last used pipe for splice + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. */ - struct pipe_inode_info *splice_pipe; + unsigned long *numa_faults; + unsigned long total_numa_faults; + + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[3]; + + unsigned long numa_pages_migrated; +#endif /* CONFIG_NUMA_BALANCING */ + + struct rseq_data rseq; + struct sched_mm_cid mm_cid; + + struct tlbflush_unmap_batch tlb_ubc; + + /* Cache last used pipe for splice(): */ + struct pipe_inode_info *splice_pipe; - struct page_frag task_frag; + struct page_frag task_frag; -#ifdef CONFIG_TASK_DELAY_ACCT - struct task_delay_info *delays; +#ifdef CONFIG_TASK_DELAY_ACCT + struct task_delay_info *delays; #endif + #ifdef CONFIG_FAULT_INJECTION - int make_it_fail; + int make_it_fail; + unsigned int fail_nth; #endif /* - * when (nr_dirtied >= nr_dirtied_pause), it's time to call - * balance_dirty_pages() for some dirty throttling pause + * When (nr_dirtied >= nr_dirtied_pause), it's time to call + * balance_dirty_pages() for a dirty throttling pause: */ - int nr_dirtied; - int nr_dirtied_pause; - unsigned long dirty_paused_when; /* start of a write-and-pause period */ + int nr_dirtied; + int nr_dirtied_pause; + /* Start of a write-and-pause period: */ + unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP - int latency_record_count; - struct latency_record latency_record[LT_SAVECOUNT]; + int latency_record_count; + struct latency_record latency_record[LT_SAVECOUNT]; #endif /* - * time slack values; these are used to round up poll() and + * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + unsigned int kasan_depth; +#endif + +#ifdef CONFIG_KCSAN + struct kcsan_ctx kcsan_ctx; +#ifdef CONFIG_TRACE_IRQFLAGS + struct irqtrace_events kcsan_save_irqtrace; +#endif +#ifdef CONFIG_KCSAN_WEAK_MEMORY + int kcsan_stack_depth; +#endif +#endif + +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + +#if IS_ENABLED(CONFIG_KUNIT) + struct kunit *kunit_test; +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER - /* Index of current stored address in ret_stack */ - int curr_ret_stack; - /* Stack of return addresses for return function tracing */ - struct ftrace_ret_stack *ret_stack; - /* time stamp for last schedule */ - unsigned long long ftrace_timestamp; + /* Index of current stored address in ret_stack: */ + int curr_ret_stack; + int curr_ret_depth; + + /* Stack of return addresses for return function tracing: */ + unsigned long *ret_stack; + + /* Timestamp for last schedule: */ + unsigned long long ftrace_timestamp; + unsigned long long ftrace_sleeptime; + /* * Number of functions that haven't been traced - * because of depth overrun. + * because of depth overrun: */ - atomic_t trace_overrun; - /* Pause for the tracing */ - atomic_t tracing_graph_pause; + atomic_t trace_overrun; + + /* Pause tracing: */ + atomic_t tracing_graph_pause; #endif + #ifdef CONFIG_TRACING - /* state flags for use by tracers */ - unsigned long trace; - /* bitmask and counter of trace recursion */ - unsigned long trace_recursion; + /* Bitmask and counter of trace recursion: */ + unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; - unsigned int memcg_kmem_skip_account; -#endif -#ifdef CONFIG_UPROBES - struct uprobe_task *utask; -#endif -#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) - unsigned int sequential_io; - unsigned int sequential_io_avg; -#endif -}; -/* Future-safe accessor for struct task_struct's cpus_allowed. */ -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_KCOV + /* See kernel/kcov.c for more details. */ -#ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); -extern void set_numabalancing_state(bool enabled); -#else -static inline void task_numa_fault(int node, int pages, bool migrated) -{ -} -static inline void set_numabalancing_state(bool enabled) -{ -} -#endif + /* Coverage collection mode enabled for this task (0 if disabled): */ + unsigned int kcov_mode; -static inline struct pid *task_pid(struct task_struct *task) -{ - return task->pids[PIDTYPE_PID].pid; -} + /* Size of the kcov_area: */ + unsigned int kcov_size; -static inline struct pid *task_tgid(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PID].pid; -} + /* Buffer for coverage collection: */ + void *kcov_area; -/* - * Without tasklist or rcu lock it is not safe to dereference - * the result of task_pgrp/task_session even if task == current, - * we can race with another thread doing sys_setsid/sys_setpgid. - */ -static inline struct pid *task_pgrp(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_PGID].pid; -} + /* KCOV descriptor wired with this task or NULL: */ + struct kcov *kcov; -static inline struct pid *task_session(struct task_struct *task) -{ - return task->group_leader->pids[PIDTYPE_SID].pid; -} + /* KCOV common handle for remote coverage collection: */ + u64 kcov_handle; -struct pid_namespace; + /* KCOV sequence number: */ + int kcov_sequence; -/* - * the helpers to get the task's different pids as they are seen - * from various namespaces - * - * task_xid_nr() : global id, i.e. the id seen from the init namespace; - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of - * current. - * task_xid_nr_ns() : id seen from the ns specified; - * - * set_task_vxid() : assigns a virtual id to a task; - * - * see also pid_nr() etc in include/linux/pid.h - */ -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, - struct pid_namespace *ns); + /* Collect coverage from softirq context: */ + unsigned int kcov_softirq; +#endif -static inline pid_t task_pid_nr(struct task_struct *tsk) -{ - return tsk->pid; -} +#ifdef CONFIG_MEMCG_V1 + struct mem_cgroup *memcg_in_oom; +#endif -static inline pid_t task_pid_nr_ns(struct task_struct *tsk, - struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); -} +#ifdef CONFIG_MEMCG + /* Number of pages to reclaim on returning to userland: */ + unsigned int memcg_nr_pages_over_high; -static inline pid_t task_pid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); -} + /* Used by memcontrol for targeted memcg charge: */ + struct mem_cgroup *active_memcg; + /* Cache for current->cgroups->memcg->objcg lookups: */ + struct obj_cgroup *objcg; +#endif -static inline pid_t task_tgid_nr(struct task_struct *tsk) -{ - return tsk->tgid; -} +#ifdef CONFIG_BLK_CGROUP + struct gendisk *throttle_disk; +#endif -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +#ifdef CONFIG_UPROBES + struct uprobe_task *utask; +#endif +#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) + unsigned int sequential_io; + unsigned int sequential_io_avg; +#endif + struct kmap_ctrl kmap_ctrl; +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +# ifdef CONFIG_PREEMPT_RT + unsigned long saved_state_change; +# endif +#endif + struct rcu_head rcu; + refcount_t rcu_users; + int pagefault_disabled; +#ifdef CONFIG_MMU + struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; +#endif +#ifdef CONFIG_VMAP_STACK + struct vm_struct *stack_vm_area; +#endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* A live task holds one reference: */ + refcount_t stack_refcount; +#endif +#ifdef CONFIG_LIVEPATCH + int patch_state; +#endif +#ifdef CONFIG_SECURITY + /* Used by LSM modules for access restriction: */ + void *security; +#endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage __rcu *bpf_storage; + /* Used for BPF run context */ + struct bpf_run_ctx *bpf_ctx; +#endif + /* Used by BPF for per-TASK xdp storage */ + struct bpf_net_context *bpf_net_context; -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} +#ifdef CONFIG_KSTACK_ERASE + unsigned long lowest_stack; +#endif +#ifdef CONFIG_KSTACK_ERASE_METRICS + unsigned long prev_lowest_stack; +#endif +#ifdef CONFIG_X86_MCE + void __user *mce_vaddr; + __u64 mce_kflags; + u64 mce_addr; + __u64 mce_ripv : 1, + mce_whole_page : 1, + __mce_reserved : 62; + struct callback_head mce_kill_me; + int mce_count; +#endif -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, - struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); -} +#ifdef CONFIG_KRETPROBES + struct llist_head kretprobe_instances; +#endif +#ifdef CONFIG_RETHOOK + struct llist_head rethooks; +#endif -static inline pid_t task_pgrp_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); -} +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH + /* + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores + */ + struct callback_head l1d_flush_kill; +#endif +#ifdef CONFIG_RV + /* + * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. + * If memory becomes a concern, we can think about a dynamic method. + */ + union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; +#endif -static inline pid_t task_session_nr_ns(struct task_struct *tsk, - struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); -} +#ifdef CONFIG_USER_EVENTS + struct user_event_mm *user_event_mm; +#endif -static inline pid_t task_session_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); -} +#ifdef CONFIG_UNWIND_USER + struct unwind_task_info unwind_info; +#endif + + /* CPU-specific state of this task: */ + struct thread_struct thread; + + /* + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. + */ + randomized_struct_fields_end +} __attribute__ ((aligned (64))); -/* obsolete, do not use */ -static inline pid_t task_pgrp_nr(struct task_struct *tsk) +#ifdef CONFIG_SCHED_PROXY_EXEC +DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); +static inline bool sched_proxy_exec(void) { - return task_pgrp_nr_ns(tsk, &init_pid_ns); + return static_branch_likely(&__sched_proxy_exec); } - -/** - * pid_alive - check that a task structure is not stale - * @p: Task structure to be checked. - * - * Test if a process is not yet dead (at most zombie state) - * If pid_alive fails, then pointers within the task structure - * can be stale and must not be dereferenced. - */ -static inline int pid_alive(struct task_struct *p) +#else +static inline bool sched_proxy_exec(void) { - return p->pids[PIDTYPE_PID].pid != NULL; + return false; } +#endif -/** - * is_global_init - check if a task structure is init - * @tsk: Task structure to be checked. - * - * Check if a task structure is the first user space task the kernel created. - */ -static inline int is_global_init(struct task_struct *tsk) +#define TASK_REPORT_IDLE (TASK_REPORT + 1) +#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) + +static inline unsigned int __task_state_index(unsigned int tsk_state, + unsigned int tsk_exit_state) { - return tsk->pid == 1; -} + unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; -extern struct pid *cad_pid; + BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); -extern void free_task(struct task_struct *tsk); -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) + if ((tsk_state & TASK_IDLE) == TASK_IDLE) + state = TASK_REPORT_IDLE; -extern void __put_task_struct(struct task_struct *t); + /* + * We're lying here, but rather than expose a completely new task state + * to userspace, we can make this appear as if the task has gone through + * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. + */ + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) + state = TASK_UNINTERRUPTIBLE; -static inline void put_task_struct(struct task_struct *t) -{ - if (atomic_dec_and_test(&t->usage)) - __put_task_struct(t); + return fls(state); } -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, - cputime_t *utime, cputime_t *stime); -extern void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled); -extern cputime_t task_gtime(struct task_struct *t); -#else -static inline void task_cputime(struct task_struct *t, - cputime_t *utime, cputime_t *stime) +static inline unsigned int task_state_index(struct task_struct *tsk) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; + return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } -static inline void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, - cputime_t *stimescaled) +static inline char task_index_to_char(unsigned int state) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; + static const char state_char[] = "RSDTtXZPI"; + + BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); + + return state_char[state]; } -static inline cputime_t task_gtime(struct task_struct *t) +static inline char task_state_to_char(struct task_struct *tsk) { - return t->gtime; + return task_index_to_char(task_state_index(tsk)); } -#endif -extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); -extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); + +extern struct pid *cad_pid; /* * Per process flags */ -#define PF_EXITING 0x00000004 /* getting shut down */ -#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ -#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ -#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ -#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ -#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ -#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ -#define PF_DUMPCORE 0x00000200 /* dumped core */ -#define PF_SIGNALED 0x00000400 /* killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ -#define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ -#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */ -#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ -#define PF_FROZEN 0x00010000 /* frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ -#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ -#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ -#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ -#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ -#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ -#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ -#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ +#define PF_VCPU 0x00000001 /* I'm a virtual CPU */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_EXITING 0x00000004 /* Getting shut down */ +#define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ +#define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ +#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* Dumped core */ +#define PF_SIGNALED 0x00000400 /* Killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ +#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ +#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ +#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ +#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, + * I am cleaning dirty pages from some other bdi. */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ +#define PF__HOLE__00800000 0x00800000 +#define PF__HOLE__01000000 0x01000000 +#define PF__HOLE__02000000 0x02000000 +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ +#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ +#define PF__HOLE__40000000 0x40000000 +#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other @@ -1640,295 +1753,173 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, * child is not running and in turn not changing child->flags * at the same time the parent does it. */ -#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) -#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) -#define clear_used_math() clear_stopped_child_used_math(current) -#define set_used_math() set_stopped_child_used_math(current) +#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) +#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) +#define clear_used_math() clear_stopped_child_used_math(current) +#define set_used_math() set_stopped_child_used_math(current) + #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) -#define conditional_used_math(condition) \ - conditional_stopped_child_used_math(condition, current) + +#define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) + #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) + /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ -#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) -#define used_math() tsk_used_math(current) +#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) +#define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static __always_inline bool is_percpu_thread(void) { - if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~__GFP_IO; - return flags; + return (current->flags & PF_NO_SETAFFINITY) && + (current->nr_cpus_allowed == 1); } -static inline unsigned int memalloc_noio_save(void) -{ - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; - current->flags |= PF_MEMALLOC_NOIO; - return flags; -} +/* Per-process atomic flags. */ +#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ +#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ +#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ +#define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ +#define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ +#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ +#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ +#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ -static inline void memalloc_noio_restore(unsigned int flags) -{ - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; -} +#define TASK_PFA_TEST(name, func) \ + static inline bool task_##func(struct task_struct *p) \ + { return test_bit(PFA_##name, &p->atomic_flags); } -/* - * task->jobctl flags - */ -#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ - -#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ -#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ -#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ -#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ -#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ -#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ -#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ - -#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) -#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) -#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) -#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) -#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) -#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) - -#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) - -extern bool task_set_jobctl_pending(struct task_struct *task, - unsigned int mask); -extern void task_clear_jobctl_trapping(struct task_struct *task); -extern void task_clear_jobctl_pending(struct task_struct *task, - unsigned int mask); +#define TASK_PFA_SET(name, func) \ + static inline void task_set_##func(struct task_struct *p) \ + { set_bit(PFA_##name, &p->atomic_flags); } -#ifdef CONFIG_PREEMPT_RCU +#define TASK_PFA_CLEAR(name, func) \ + static inline void task_clear_##func(struct task_struct *p) \ + { clear_bit(PFA_##name, &p->atomic_flags); } -#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ +TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) +TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) -static inline void rcu_copy_process(struct task_struct *p) -{ - p->rcu_read_lock_nesting = 0; - p->rcu_read_unlock_special = 0; -#ifdef CONFIG_TREE_PREEMPT_RCU - p->rcu_blocked_node = NULL; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ - INIT_LIST_HEAD(&p->rcu_node_entry); -} +TASK_PFA_TEST(SPREAD_PAGE, spread_page) +TASK_PFA_SET(SPREAD_PAGE, spread_page) +TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) -#else +TASK_PFA_TEST(SPREAD_SLAB, spread_slab) +TASK_PFA_SET(SPREAD_SLAB, spread_slab) +TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) -static inline void rcu_copy_process(struct task_struct *p) -{ -} +TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) -#endif +TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) -static inline void tsk_restore_flags(struct task_struct *task, - unsigned long orig_flags, unsigned long flags) -{ - task->flags &= ~flags; - task->flags |= orig_flags & flags; -} +TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) +TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) -#ifdef CONFIG_SMP -extern void do_set_cpus_allowed(struct task_struct *p, - const struct cpumask *new_mask); +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) -extern int set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, - const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask) +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) +TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) + +static inline void +current_restore_flags(unsigned long orig_flags, unsigned long flags) { - if (!cpumask_test_cpu(0, new_mask)) - return -EINVAL; - return 0; + current->flags &= ~flags; + current->flags |= orig_flags & flags; } -#endif -#ifdef CONFIG_NO_HZ_COMMON -void calc_load_enter_idle(void); -void calc_load_exit_idle(void); -#else -static inline void calc_load_enter_idle(void) { } -static inline void calc_load_exit_idle(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); +extern int task_can_attach(struct task_struct *p); +extern int dl_bw_alloc(int cpu, u64 dl_bw); +extern void dl_bw_free(int cpu, u64 dl_bw); -#ifndef CONFIG_CPUMASK_OFFSTACK -static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) -{ - return set_cpus_allowed_ptr(p, &new_mask); -} -#endif +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); -/* - * Do not use outside of architecture code which knows its limitations. - * - * sched_clock() has no promise of monotonicity or bounded drift between - * CPUs, use (which you should not) requires disabling IRQs. +/** + * set_cpus_allowed_ptr - set CPU affinity mask of a task + * @p: the task + * @new_mask: CPU affinity mask * - * Please use one of the three interfaces below. + * Return: zero if successful, or a negative error code */ -extern unsigned long long notrace sched_clock(void); -/* - * See the comment in kernel/sched/clock.c - */ -extern u64 cpu_clock(int cpu); -extern u64 local_clock(void); -extern u64 sched_clock_cpu(int cpu); - - -extern void sched_clock_init(void); - -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_tick(void) -{ -} - -static inline void sched_clock_idle_sleep_event(void) -{ -} +extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); +extern void release_user_cpus_ptr(struct task_struct *p); +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); + +extern int yield_to(struct task_struct *p, bool preempt); +extern void set_user_nice(struct task_struct *p, long nice); +extern int task_prio(const struct task_struct *p); -static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. + */ +static inline int task_nice(const struct task_struct *p) { + return PRIO_TO_NICE((p)->static_prio); } -#else -/* - * Architectures can set this to 1 if they have specified - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, - * but then during bootup it turns out that sched_clock() - * is reliable after all: - */ -extern int sched_clock_stable; -extern void sched_clock_tick(void); -extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); -#endif - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -/* - * An i/f to runtime opt-in for irq time accounting based off of sched_clock. - * The reason for this explicit opt-in is not to have perf penalty with - * slow sched_clocks. - */ -extern void enable_sched_clock_irqtime(void); -extern void disable_sched_clock_irqtime(void); -#else -static inline void enable_sched_clock_irqtime(void) {} -static inline void disable_sched_clock_irqtime(void) {} -#endif - -extern unsigned long long -task_sched_runtime(struct task_struct *task); - -/* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP -extern void sched_exec(void); -#else -#define sched_exec() {} -#endif - -extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); - -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else -static inline void idle_task_exit(void) {} -#endif - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) -extern void wake_up_nohz_cpu(int cpu); -#else -static inline void wake_up_nohz_cpu(int cpu) { } -#endif - -#ifdef CONFIG_NO_HZ_FULL -extern bool sched_can_stop_tick(void); -extern u64 scheduler_tick_max_deferment(void); -#else -static inline bool sched_can_stop_tick(void) { return false; } -#endif - -#ifdef CONFIG_SCHED_AUTOGROUP -extern void sched_autogroup_create_attach(struct task_struct *p); -extern void sched_autogroup_detach(struct task_struct *p); -extern void sched_autogroup_fork(struct signal_struct *sig); -extern void sched_autogroup_exit(struct signal_struct *sig); -#ifdef CONFIG_PROC_FS -extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); -#endif -#else -static inline void sched_autogroup_create_attach(struct task_struct *p) { } -static inline void sched_autogroup_detach(struct task_struct *p) { } -static inline void sched_autogroup_fork(struct signal_struct *sig) { } -static inline void sched_autogroup_exit(struct signal_struct *sig) { } -#endif - -extern bool yield_to(struct task_struct *p, bool preempt); -extern void set_user_nice(struct task_struct *p, long nice); -extern int task_prio(const struct task_struct *p); -extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int sched_setscheduler(struct task_struct *, int, - const struct sched_param *); -extern int sched_setscheduler_nocheck(struct task_struct *, int, - const struct sched_param *); +extern int available_idle_cpu(int cpu); +extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); +extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern void sched_set_fifo(struct task_struct *p); +extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); +extern void sched_set_normal(struct task_struct *p, int nice); +extern int sched_setattr(struct task_struct *, const struct sched_attr *); +extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); + /** * is_idle_task - is the specified task an idle task? * @p: the task in question. + * + * Return: 1 if @p is an idle task. 0 otherwise. */ -static inline bool is_idle_task(const struct task_struct *p) +static __always_inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } + extern struct task_struct *curr_task(int cpu); -extern void set_curr_task(int cpu, struct task_struct *p); +extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); -/* - * The default (Linux) execution domain. - */ -extern struct exec_domain default_exec_domain; - union thread_union { + struct task_struct task; +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; -#ifndef __HAVE_ARCH_KSTACK_END -static inline int kstack_end(void *addr) -{ - /* Reliable end of stack detection: - * Some APM bios versions misalign the stack - */ - return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); -} +#ifndef CONFIG_THREAD_INFO_IN_TASK +extern struct thread_info init_thread_info; #endif -extern union thread_union init_thread_union; -extern struct task_struct init_task; - -extern struct mm_struct init_mm; +extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; -extern struct pid_namespace init_pid_ns; +#ifdef CONFIG_THREAD_INFO_IN_TASK +# define task_thread_info(task) (&(task)->thread_info) +#else +# define task_thread_info(task) ((struct thread_info *)(task)->stack) +#endif /* * find a task by one of its numerical ids @@ -1942,385 +1933,59 @@ extern struct pid_namespace init_pid_ns; */ extern struct task_struct *find_task_by_vpid(pid_t nr); -extern struct task_struct *find_task_by_pid_ns(pid_t nr, - struct pid_namespace *ns); - -/* per-UID process charging. */ -extern struct user_struct * alloc_uid(kuid_t); -static inline struct user_struct *get_uid(struct user_struct *u) -{ - atomic_inc(&u->__count); - return u; -} -extern void free_uid(struct user_struct *); +extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -#include <asm/current.h> - -extern void xtime_update(unsigned long ticks); +/* + * find a task by its virtual pid and get the task struct + */ +extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP - extern void kick_process(struct task_struct *tsk); -#else - static inline void kick_process(struct task_struct *tsk) { } -#endif -extern void sched_fork(struct task_struct *p); -extern void sched_dead(struct task_struct *p); - -extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void __flush_signals(struct task_struct *); -extern void ignore_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); - -static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - ret = dequeue_signal(tsk, mask, info); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); - - return ret; -} - -extern void block_all_signals(int (*notifier)(void *priv), void *priv, - sigset_t *mask); -extern void unblock_all_signals(void); -extern void release_task(struct task_struct * p); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); -extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); -extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); -extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, - const struct cred *, u32); -extern int kill_pgrp(struct pid *pid, int sig, int priv); -extern int kill_pid(struct pid *pid, int sig, int priv); -extern int kill_proc_info(int, struct siginfo *, pid_t); -extern __must_check bool do_notify_parent(struct task_struct *, int); -extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); -extern int zap_other_threads(struct task_struct *p); -extern struct sigqueue *sigqueue_alloc(void); -extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); -extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); - -static inline void restore_saved_sigmask(void) -{ - if (test_and_clear_restore_sigmask()) - __set_current_blocked(¤t->saved_sigmask); -} - -static inline sigset_t *sigmask_to_save(void) -{ - sigset_t *res = ¤t->blocked; - if (unlikely(test_restore_sigmask())) - res = ¤t->saved_sigmask; - return res; -} - -static inline int kill_cad_pid(int sig, int priv) -{ - return kill_pid(cad_pid, sig, priv); -} - -/* These can be the second arg to send_sig_info/send_group_sig_info. */ -#define SEND_SIG_NOINFO ((struct siginfo *) 0) -#define SEND_SIG_PRIV ((struct siginfo *) 1) -#define SEND_SIG_FORCED ((struct siginfo *) 2) - -/* - * True if we are on the alternate signal stack. - */ -static inline int on_sig_stack(unsigned long sp) -{ -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif -} - -static inline int sas_ss_flags(unsigned long sp) -{ - return (current->sas_ss_size == 0 ? SS_DISABLE - : on_sig_stack(sp) ? SS_ONSTACK : 0); -} - -static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) -{ - if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) -#ifdef CONFIG_STACK_GROWSUP - return current->sas_ss_sp; -#else - return current->sas_ss_sp + current->sas_ss_size; -#endif - return sp; -} -/* - * Routines for handling mm_structs - */ -extern struct mm_struct * mm_alloc(void); +extern void kick_process(struct task_struct *tsk); -/* mmdrop drops the mm and the page tables */ -extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct * mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} - -/* mmput gets rid of the mappings and all user-space */ -extern void mmput(struct mm_struct *); -/* Grab a reference to a task's mm, if it is not already going away */ -extern struct mm_struct *get_task_mm(struct task_struct *task); -/* - * Grab a reference to a task's mm, if it is not already going away - * and ptrace_may_access with the mode parameter passed to it - * succeeds. - */ -extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); -/* Remove the current tasks stale references to the old mm_struct */ -extern void mm_release(struct task_struct *, struct mm_struct *); -/* Allocate a new mm structure and copy contents from tsk->mm */ -extern struct mm_struct *dup_mm(struct task_struct *tsk); - -extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *); -extern void flush_thread(void); -extern void exit_thread(void); - -extern void exit_files(struct task_struct *); -extern void __cleanup_sighand(struct sighand_struct *); - -extern void exit_itimers(struct signal_struct *); -extern void flush_itimer_signals(void); - -extern void do_group_exit(int); - -extern int allow_signal(int); -extern int disallow_signal(int); - -extern int do_execve(const char *, - const char __user * const __user *, - const char __user * const __user *); -extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); -struct task_struct *fork_idle(int); -extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); - -extern void set_task_comm(struct task_struct *tsk, char *from); -extern char *get_task_comm(char *to, struct task_struct *tsk); - -#ifdef CONFIG_SMP -void scheduler_ipi(void); -extern unsigned long wait_task_inactive(struct task_struct *, long match_state); -#else -static inline void scheduler_ipi(void) { } -static inline unsigned long wait_task_inactive(struct task_struct *p, - long match_state) -{ - return 1; -} -#endif - -#define next_task(p) \ - list_entry_rcu((p)->tasks.next, struct task_struct, tasks) - -#define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) - -extern bool current_is_single_threaded(void); - -/* - * Careful: do_each_thread/while_each_thread is a double loop so - * 'break' will not work as expected - use goto instead. - */ -#define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do - -#define while_each_thread(g, t) \ - while ((t = next_thread(t)) != g) - -static inline int get_nr_threads(struct task_struct *tsk) -{ - return tsk->signal->nr_threads; -} - -static inline bool thread_group_leader(struct task_struct *p) -{ - return p->exit_signal >= 0; -} - -/* Do to the insanities of de_thread it is possible for a process - * to have the pid of the thread group leader without actually being - * the thread group leader. For iteration through the pids in proc - * all we care about is that we have a task with the appropriate - * pid, we don't actually care if we have the right task. - */ -static inline int has_group_leader_pid(struct task_struct *p) -{ - return p->pid == p->tgid; -} - -static inline -int same_thread_group(struct task_struct *p1, struct task_struct *p2) -{ - return p1->tgid == p2->tgid; -} - -static inline struct task_struct *next_thread(const struct task_struct *p) -{ - return list_entry_rcu(p->thread_group.next, - struct task_struct, thread_group); -} - -static inline int thread_group_empty(struct task_struct *p) -{ - return list_empty(&p->thread_group); -} - -#define delay_group_leader(p) \ - (thread_group_leader(p) && !thread_group_empty(p)) +extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) /* - * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring - * subscriptions and synchronises with wait4(). Also used in procfs. Also - * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. + * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. */ -static inline void task_lock(struct task_struct *p) -{ - spin_lock(&p->alloc_lock); -} - -static inline void task_unlock(struct task_struct *p) -{ - spin_unlock(&p->alloc_lock); -} - -extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, - unsigned long *flags); - -static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - struct sighand_struct *ret; - - ret = __lock_task_sighand(tsk, flags); - (void)__cond_lock(&tsk->sighand->siglock, ret); - return ret; -} - -static inline void unlock_task_sighand(struct task_struct *tsk, - unsigned long *flags) -{ - spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); -} - -#ifdef CONFIG_CGROUPS -static inline void threadgroup_change_begin(struct task_struct *tsk) -{ - down_read(&tsk->signal->group_rwsem); -} -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); -} - -/** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. - * - * fork and exit paths explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. - * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. - */ -static inline void threadgroup_lock(struct task_struct *tsk) -{ - down_write(&tsk->signal->group_rwsem); -} - -/** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock - * - * Reverse threadgroup_lock(). - */ -static inline void threadgroup_unlock(struct task_struct *tsk) -{ - up_write(&tsk->signal->group_rwsem); -} -#else -static inline void threadgroup_change_begin(struct task_struct *tsk) {} -static inline void threadgroup_change_end(struct task_struct *tsk) {} -static inline void threadgroup_lock(struct task_struct *tsk) {} -static inline void threadgroup_unlock(struct task_struct *tsk) {} -#endif - -#ifndef __HAVE_THREAD_FUNCTIONS - -#define task_thread_info(task) ((struct thread_info *)(task)->stack) -#define task_stack_page(task) ((task)->stack) - -static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) -{ - *task_thread_info(p) = *task_thread_info(org); - task_thread_info(p)->task = p; -} - -static inline unsigned long *end_of_stack(struct task_struct *p) -{ - return (unsigned long *)(task_thread_info(p) + 1); -} - -#endif +#define get_task_comm(buf, tsk) ({ \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ +}) -static inline int object_is_on_stack(void *obj) +static __always_inline void scheduler_ipi(void) { - void *stack = task_stack_page(current); - - return (obj >= stack) && (obj < (stack + THREAD_SIZE)); + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); } -extern void thread_info_cache_init(void); - -#ifdef CONFIG_DEBUG_STACK_USAGE -static inline unsigned long stack_not_used(struct task_struct *p) -{ - unsigned long *n = end_of_stack(p); - - do { /* Skip over canary */ - n++; - } while (!*n); - - return (unsigned long)n - (unsigned long)end_of_stack(p); -} -#endif +extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); -/* set thread flags in other task's structures - * - see asm/thread_info.h for TIF_xxxx flags available +/* + * Set thread flags in other task's structures. + * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { @@ -2332,6 +1997,12 @@ static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) clear_ti_thread_flag(task_thread_info(tsk), flag); } +static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, + bool value) +{ + update_ti_thread_flag(task_thread_info(tsk), flag, value); +} + static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); @@ -2349,12 +2020,16 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) static inline void set_tsk_need_resched(struct task_struct *tsk) { + if (tracepoint_enabled(sched_set_need_resched_tp) && + !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) + __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, + (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) @@ -2362,169 +2037,170 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } -static inline int restart_syscall(void) +static inline void set_need_resched_current(void) { - set_tsk_thread_flag(current, TIF_SIGPENDING); - return -ERESTARTNOINTR; + lockdep_assert_irqs_disabled(); + set_tsk_need_resched(current); + set_preempt_need_resched(); } -static inline int signal_pending(struct task_struct *p) +/* + * cond_resched() and cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. The return + * value indicates whether a reschedule was done in fact. + * cond_resched_lock() will drop the spinlock before scheduling, + */ +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +extern int __cond_resched(void); + +#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + +DECLARE_STATIC_CALL(cond_resched, __cond_resched); + +static __always_inline int _cond_resched(void) { - return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); + return static_call_mod(cond_resched)(); } -static inline int __fatal_signal_pending(struct task_struct *p) +#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) + +extern int dynamic_cond_resched(void); + +static __always_inline int _cond_resched(void) { - return unlikely(sigismember(&p->pending.signal, SIGKILL)); + return dynamic_cond_resched(); } -static inline int fatal_signal_pending(struct task_struct *p) +#else /* !CONFIG_PREEMPTION */ + +static inline int _cond_resched(void) { - return signal_pending(p) && __fatal_signal_pending(p); + return __cond_resched(); } -static inline int signal_pending_state(long state, struct task_struct *p) -{ - if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) - return 0; - if (!signal_pending(p)) - return 0; +#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); -} +#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ -static inline int need_resched(void) +static inline int _cond_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return 0; } -/* - * cond_resched() and cond_resched_lock(): latency reduction via - * explicit rescheduling in places that are safe. The return - * value indicates whether a reschedule was done in fact. - * cond_resched_lock() will drop the spinlock before scheduling, - * cond_resched_softirq() will enable bhs before scheduling. - */ -extern int _cond_resched(void); +#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ - __might_sleep(__FILE__, __LINE__, 0); \ + __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); +extern int __cond_resched_rwlock_read(rwlock_t *lock); +extern int __cond_resched_rwlock_write(rwlock_t *lock); + +#define MIGHT_RESCHED_RCU_SHIFT 8 +#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET +#ifndef CONFIG_PREEMPT_RT +/* + * Non RT kernels have an elevated preempt count due to the held lock, + * but are not allowed to be inside a RCU read side critical section + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else -#define PREEMPT_LOCK_OFFSET 0 +/* + * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in + * cond_resched*lock() has to take that into account because it checks for + * preempt_count() and rcu_preempt_depth(). + */ +# define PREEMPT_LOCK_RESCHED_OFFSETS \ + (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif -#define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ - __cond_resched_lock(lock); \ +#define cond_resched_lock(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_lock(lock); \ }) -extern int __cond_resched_softirq(void); +#define cond_resched_rwlock_read(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_read(lock); \ +}) -#define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ - __cond_resched_softirq(); \ +#define cond_resched_rwlock_write(lock) ({ \ + __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ + __cond_resched_rwlock_write(lock); \ }) -static inline void cond_resched_rcu(void) +#ifndef CONFIG_PREEMPT_RT +static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); -#endif -} + struct mutex *m = p->blocked_on; -/* - * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPT, - * but a general need for low latency) - */ -static inline int spin_needbreak(spinlock_t *lock) -{ -#ifdef CONFIG_PREEMPT - return spin_is_contended(lock); -#else - return 0; -#endif + if (m) + lockdep_assert_held_once(&m->wait_lock); + return m; } -/* - * Idle thread specific functions to determine the need_resched - * polling state. We have two versions, one based on TS_POLLING in - * thread_info.status and one based on TIF_POLLING_NRFLAG in - * thread_info.flags - */ -#ifdef TS_POLLING -static inline int tsk_is_polling(struct task_struct *p) -{ - return task_thread_info(p)->status & TS_POLLING; -} -static inline void current_set_polling(void) +static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { - current_thread_info()->status |= TS_POLLING; -} + struct mutex *blocked_on = READ_ONCE(p->blocked_on); -static inline void current_clr_polling(void) -{ - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); + WARN_ON_ONCE(!m); + /* The task should only be setting itself as blocked */ + WARN_ON_ONCE(p != current); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * Check ensure we don't overwrite existing mutex value + * with a different mutex. Note, setting it to the same + * lock repeatedly is ok. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + WRITE_ONCE(p->blocked_on, m); } -#elif defined(TIF_POLLING_NRFLAG) -static inline int tsk_is_polling(struct task_struct *p) + +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { - return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); + guard(raw_spinlock_irqsave)(&m->wait_lock); + __set_task_blocked_on(p, m); } -static inline void current_set_polling(void) + +static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - set_thread_flag(TIF_POLLING_NRFLAG); + if (m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + } + WRITE_ONCE(p->blocked_on, NULL); } -static inline void current_clr_polling(void) +static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - clear_thread_flag(TIF_POLLING_NRFLAG); + guard(raw_spinlock_irqsave)(&m->wait_lock); + __clear_task_blocked_on(p, m); } #else -static inline int tsk_is_polling(struct task_struct *p) { return 0; } -static inline void current_set_polling(void) { } -static inline void current_clr_polling(void) { } -#endif - -/* - * Thread group CPU time accounting. - */ -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); -void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); - -static inline void thread_group_cputime_init(struct signal_struct *sig) +static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { - raw_spin_lock_init(&sig->cputimer.lock); } -/* - * Reevaluate whether the task has signals pending delivery. - * Wake the task if so. - * This is required every time the blocked sigset_t changes. - * callers must hold sighand->siglock. - */ -extern void recalc_sigpending_and_wake(struct task_struct *t); -extern void recalc_sigpending(void); - -extern void signal_wake_up_state(struct task_struct *t, unsigned int state); - -static inline void signal_wake_up(struct task_struct *t, bool resume) +static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { - signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); } -static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +#endif /* !CONFIG_PREEMPT_RT */ + +static __always_inline bool need_resched(void) { - signal_wake_up_state(t, resume ? __TASK_TRACED : 0); + return unlikely(tif_need_resched()); } /* @@ -2534,7 +2210,7 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) static inline unsigned int task_cpu(const struct task_struct *p) { - return task_thread_info(p)->cpu; + return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); @@ -2552,91 +2228,216 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ -extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); -extern long sched_getaffinity(pid_t pid, struct cpumask *mask); - -#ifdef CONFIG_CGROUP_SCHED -extern struct task_group root_task_group; -#endif /* CONFIG_CGROUP_SCHED */ - -extern int task_can_switch_user(struct user_struct *up, - struct task_struct *tsk); - -#ifdef CONFIG_TASK_XACCT -static inline void add_rchar(struct task_struct *tsk, ssize_t amt) +static inline bool task_is_runnable(struct task_struct *p) { - tsk->ioac.rchar += amt; + return p->on_rq && !p->se.sched_delayed; } -static inline void add_wchar(struct task_struct *tsk, ssize_t amt) -{ - tsk->ioac.wchar += amt; -} +extern bool sched_task_on_rq(struct task_struct *p); +extern unsigned long get_wchan(struct task_struct *p); +extern struct task_struct *cpu_curr_snapshot(int cpu); -static inline void inc_syscr(struct task_struct *tsk) +/* + * In order to reduce various lock holder preemption latencies provide an + * interface to see if a vCPU is currently running or not. + * + * This allows us to terminate optimistic spin loops and block, analogous to + * the native optimistic spin heuristic of testing if the lock owner task is + * running or not. + */ +#ifndef vcpu_is_preempted +static inline bool vcpu_is_preempted(int cpu) { - tsk->ioac.syscr++; + return false; } +#endif + +extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); +extern long sched_getaffinity(pid_t pid, struct cpumask *mask); + +#ifndef TASK_SIZE_OF +#define TASK_SIZE_OF(tsk) TASK_SIZE +#endif -static inline void inc_syscw(struct task_struct *tsk) +static inline bool owner_on_cpu(struct task_struct *owner) { - tsk->ioac.syscw++; + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } + +/* Returns effective CPU energy utilization, as seen by the scheduler */ +unsigned long sched_cpu_util(int cpu); + +#ifdef CONFIG_SCHED_CORE +extern void sched_core_free(struct task_struct *tsk); +extern void sched_core_fork(struct task_struct *p); +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr); +extern int sched_core_idle_cpu(int cpu); #else -static inline void add_rchar(struct task_struct *tsk, ssize_t amt) +static inline void sched_core_free(struct task_struct *tsk) { } +static inline void sched_core_fork(struct task_struct *p) { } +static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } +#endif + +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + +#ifdef CONFIG_MEM_ALLOC_PROFILING +static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { + swap(current->alloc_tag, tag); + return tag; } -static inline void add_wchar(struct task_struct *tsk, ssize_t amt) +static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; } +#else +#define alloc_tag_save(_tag) NULL +#define alloc_tag_restore(_tag, _old) do {} while (0) +#endif -static inline void inc_syscr(struct task_struct *tsk) +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); +static __always_inline int task_mm_cid(struct task_struct *t) { + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } - -static inline void inc_syscw(struct task_struct *tsk) +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } +static __always_inline int task_mm_cid(struct task_struct *t) { + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); } #endif -#ifndef TASK_SIZE_OF -#define TASK_SIZE_OF(tsk) TASK_SIZE -#endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); -#ifdef CONFIG_MM_OWNER -extern void mm_update_next_owner(struct mm_struct *mm); -extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) #else -static inline void mm_update_next_owner(struct mm_struct *mm) -{ -} +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) -static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +static inline void __migrate_enable(void) { -} -#endif /* CONFIG_MM_OWNER */ + struct task_struct *p = current; -static inline unsigned long task_rlimit(const struct task_struct *tsk, - unsigned int limit) -{ - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; } -static inline unsigned long task_rlimit_max(const struct task_struct *tsk, - unsigned int limit) +static inline void __migrate_disable(void) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; } +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ -static inline unsigned long rlimit(unsigned int limit) +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. + */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static __always_inline void migrate_disable(void) { - return task_rlimit(current, limit); + __migrate_disable(); } -static inline unsigned long rlimit_max(unsigned int limit) +static __always_inline void migrate_enable(void) { - return task_rlimit_max(current, limit); + __migrate_enable(); } +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif |
