diff options
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r-- | include/linux/sched.h | 276 |
1 files changed, 183 insertions, 93 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c2abbc587b4..4f78a64beb52 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -13,7 +13,7 @@ #include <asm/processor.h> #include <linux/thread_info.h> #include <linux/preempt.h> -#include <linux/cpumask.h> +#include <linux/cpumask_types.h> #include <linux/cache.h> #include <linux/irqflags_types.h> @@ -36,6 +36,7 @@ #include <linux/signal_types.h> #include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> +#include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> @@ -43,8 +44,8 @@ #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> -#include <linux/livepatch_sched.h> #include <linux/uidgid_types.h> +#include <linux/tracepoint-defs.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ @@ -53,6 +54,7 @@ struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; +struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; @@ -63,6 +65,7 @@ struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; +struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; @@ -80,6 +83,8 @@ struct task_group; struct task_struct; struct user_event_mm; +#include <linux/sched/ext.h> + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -147,8 +152,9 @@ struct user_event_mm; * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ -#define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ + TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ @@ -181,6 +187,12 @@ struct user_event_mm; # define debug_rtlock_wait_restore_state() do { } while (0) #endif +#define trace_set_current_state(state_value) \ + do { \ + if (tracepoint_enabled(sched_set_state_tp)) \ + __trace_set_current_state(state_value); \ + } while (0) + /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to @@ -221,12 +233,14 @@ struct user_event_mm; #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) @@ -242,6 +256,7 @@ struct user_event_mm; \ raw_spin_lock_irqsave(¤t->pi_lock, flags); \ debug_special_state_change((state_value)); \ + trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ } while (0) @@ -277,6 +292,7 @@ struct user_event_mm; raw_spin_lock(¤t->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ + trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(¤t->pi_lock); \ } while (0); @@ -286,6 +302,7 @@ struct user_event_mm; lockdep_assert_irqs_disabled(); \ raw_spin_lock(¤t->pi_lock); \ debug_rtlock_wait_restore_state(); \ + trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(¤t->pi_lock); \ @@ -301,7 +318,7 @@ enum { TASK_COMM_LEN = 16, }; -extern void scheduler_tick(void); +extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -322,6 +339,10 @@ extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +/* wrapper function to trace from this header file */ +DECLARE_TRACEPOINT(sched_set_state_tp); +extern void __trace_set_current_state(int state_value); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode @@ -377,6 +398,11 @@ enum uclamp_id { #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); +#else +static inline void sched_domains_mutex_lock(void) { } +static inline void sched_domains_mutex_unlock(void) { } #endif struct sched_param { @@ -393,6 +419,12 @@ struct sched_info { /* Time spent waiting on a runqueue: */ unsigned long long run_delay; + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; + + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? */ @@ -516,6 +548,10 @@ struct sched_statistics { u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; +#ifdef CONFIG_NUMA_BALANCING + u64 numa_task_migrated; + u64 numa_task_swapped; +#endif u64 nr_wakeups; u64 nr_wakeups_sync; @@ -539,9 +575,14 @@ struct sched_entity { struct rb_node run_node; u64 deadline; u64 min_vruntime; + u64 min_slice; struct list_head group_node; - unsigned int on_rq; + unsigned char on_rq; + unsigned char sched_delayed; + unsigned char rel_deadline; + unsigned char custom_slice; + /* hole */ u64 exec_start; u64 sum_exec_runtime; @@ -637,12 +678,33 @@ struct sched_dl_entity { * * @dl_overrun tells if the task asked to be informed about runtime * overruns. + * + * @dl_server tells if this is a server entity. + * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * + * @dl_server_active tells if the dlserver is active(started). + * dlserver is started on first cfs enqueue on an idle runqueue + * and is stopped when a dequeue results in 0 cfs tasks on the + * runqueue. In other words, dlserver is active only when cpu's + * runqueue has atleast one cfs task. + * + * @dl_defer_running tells if the deferrable server is actually + * running, skipping the defer phase. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; + unsigned int dl_server_active : 1; + unsigned int dl_defer : 1; + unsigned int dl_defer_armed : 1; + unsigned int dl_defer_running : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -670,7 +732,7 @@ struct sched_dl_entity { */ struct rq *rq; dl_server_has_tasks_f server_has_tasks; - dl_server_pick_f server_pick; + dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* @@ -734,6 +796,12 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + struct wake_q_node { struct wake_q_node *next; }; @@ -770,6 +838,10 @@ struct task_struct { unsigned int flags; unsigned int ptrace; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; +#endif + #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; @@ -798,6 +870,9 @@ struct task_struct { struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE @@ -810,6 +885,7 @@ struct task_struct { struct task_group *sched_task_group; #endif + #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. @@ -835,6 +911,7 @@ struct task_struct { #endif unsigned int policy; + unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; @@ -898,6 +975,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; @@ -928,7 +1006,7 @@ struct task_struct { #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN @@ -962,14 +1040,17 @@ struct task_struct { #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif - + unsigned in_nf_duplicate:1; +#ifdef CONFIG_PREEMPT_RT + struct netdev_xmit net_xmit; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; @@ -1080,9 +1161,12 @@ struct task_struct { /* * executable name, excluding path. * - * - normally initialized setup_new_exec() - * - access it with [gs]et_task_comm() - * - lock it with task_lock() + * - normally initialized begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. */ char comm[TASK_COMM_LEN]; @@ -1159,6 +1243,14 @@ struct task_struct { struct mutex_waiter *blocked_on; #endif +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + /* + * Encoded lock address causing task block (lower 2 bits = type from + * <linux/hung_task.h>). Accessed via hung_task_*() helpers. + */ + unsigned long blocker; +#endif + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif @@ -1227,7 +1319,6 @@ struct task_struct { /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; - int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ @@ -1250,9 +1341,11 @@ struct task_struct { unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS + u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; @@ -1323,6 +1416,15 @@ struct task_struct { * with respect to preemption. */ unsigned long rseq_event_mask; +# ifdef CONFIG_DEBUG_RSEQ + /* + * This is a place holder to save a copy of the rseq fields for + * validation of read-only fields. The struct rseq has a + * variable-length array at the end, so it cannot be used + * directly. Reserve a size large enough for the known fields. + */ + char rseq_fields[sizeof(struct rseq)]; +# endif #endif #ifdef CONFIG_SCHED_MM_CID @@ -1396,10 +1498,11 @@ struct task_struct { int curr_ret_depth; /* Stack of return addresses for return function tracing: */ - struct ftrace_ret_stack *ret_stack; + unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; + unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced @@ -1441,19 +1544,18 @@ struct task_struct { unsigned int kcov_softirq; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; - gfp_t memcg_oom_gfp_mask; - int memcg_oom_order; +#endif +#ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; -#endif -#ifdef CONFIG_MEMCG_KMEM + /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif @@ -1502,6 +1604,8 @@ struct task_struct { /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif + /* Used by BPF for per-TASK xdp storage */ + struct bpf_net_context *bpf_net_context; #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; @@ -1550,22 +1654,15 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif - /* - * New fields for task_struct should be added above here, so that - * they are included in the randomized portion of task_struct. - */ - randomized_struct_fields_end - /* CPU-specific state of this task: */ struct thread_struct thread; /* - * WARNING: on x86, 'thread_struct' contains a variable-sized - * structure. It *MUST* be at the end of 'task_struct'. - * - * Do not put anything below here! + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. */ -}; + randomized_struct_fields_end +} __attribute__ ((aligned (64))); #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) @@ -1584,8 +1681,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); @@ -1600,7 +1698,7 @@ static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } @@ -1631,7 +1729,7 @@ extern struct pid *cad_pid; #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF__HOLE__00010000 0x00010000 +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ @@ -1639,8 +1737,8 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ -#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ +#define PF__HOLE__00800000 0x00800000 +#define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ @@ -1774,7 +1872,8 @@ static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpuma } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - if (!cpumask_test_cpu(0, new_mask)) + /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ + if ((*cpumask_bits(new_mask) & 1) == 0) return -EINVAL; return 0; } @@ -1855,7 +1954,7 @@ extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) -#elif !defined(__HAVE_THREAD_FUNCTIONS) +#else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif @@ -1889,16 +1988,28 @@ static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} - -extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +/* + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. + * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. + * + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. + */ #define get_task_comm(buf, tsk) ({ \ - BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ - __get_task_comm(buf, sizeof(buf), tsk); \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ }) #ifdef CONFIG_SMP @@ -1959,7 +2070,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) static inline void clear_tsk_need_resched(struct task_struct *tsk) { - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, + (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) @@ -1978,9 +2090,6 @@ extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -void sched_dynamic_klp_enable(void); -void sched_dynamic_klp_disable(void); - DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) @@ -2001,7 +2110,6 @@ static __always_inline int _cond_resched(void) static inline int _cond_resched(void) { - klp_sched_try_switch(); return __cond_resched(); } @@ -2011,7 +2119,6 @@ static inline int _cond_resched(void) static inline int _cond_resched(void) { - klp_sched_try_switch(); return 0; } @@ -2060,47 +2167,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) -#ifdef CONFIG_PREEMPT_DYNAMIC - -extern bool preempt_model_none(void); -extern bool preempt_model_voluntary(void); -extern bool preempt_model_full(void); - -#else - -static inline bool preempt_model_none(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_NONE); -} -static inline bool preempt_model_voluntary(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); -} -static inline bool preempt_model_full(void) -{ - return IS_ENABLED(CONFIG_PREEMPT); -} - -#endif - -static inline bool preempt_model_rt(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_RT); -} - -/* - * Does the preemption model allow non-cooperative preemption? - * - * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with - * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the - * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the - * PREEMPT_NONE model. - */ -static inline bool preempt_model_preemptible(void) -{ - return preempt_model_full() || preempt_model_rt(); -} - static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); @@ -2131,6 +2197,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline bool task_is_runnable(struct task_struct *p) +{ + return p->on_rq && !p->se.sched_delayed; +} + extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); @@ -2187,4 +2258,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +#ifdef CONFIG_MEM_ALLOC_PROFILING +static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) +{ + swap(current->alloc_tag, tag); + return tag; +} + +static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; +} +#else +#define alloc_tag_save(_tag) NULL +#define alloc_tag_restore(_tag, _old) do {} while (0) +#endif + #endif |