diff options
Diffstat (limited to 'include/linux/sched.h')
| -rw-r--r-- | include/linux/sched.h | 986 |
1 files changed, 503 insertions, 483 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 853d08f7562b..d395f2810fac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -10,41 +10,56 @@ #include <uapi/linux/sched.h> #include <asm/current.h> - -#include <linux/pid.h> -#include <linux/sem.h> +#include <asm/processor.h> +#include <linux/thread_info.h> +#include <linux/preempt.h> +#include <linux/cpumask_types.h> + +#include <linux/cache.h> +#include <linux/irqflags_types.h> +#include <linux/smp_types.h> +#include <linux/pid_types.h> +#include <linux/sem_types.h> #include <linux/shm.h> #include <linux/kmsan_types.h> -#include <linux/mutex.h> -#include <linux/plist.h> -#include <linux/hrtimer.h> -#include <linux/irqflags.h> -#include <linux/seccomp.h> -#include <linux/nodemask.h> -#include <linux/rcupdate.h> -#include <linux/refcount.h> +#include <linux/mutex_types.h> +#include <linux/plist_types.h> +#include <linux/hrtimer_types.h> +#include <linux/timer_types.h> +#include <linux/seccomp_types.h> +#include <linux/nodemask_types.h> +#include <linux/refcount_types.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> -#include <linux/syscall_user_dispatch.h> +#include <linux/spinlock.h> +#include <linux/syscall_user_dispatch_types.h> #include <linux/mm_types_task.h> +#include <linux/netdevice_xmit.h> #include <linux/task_io_accounting.h> -#include <linux/posix-timers.h> -#include <linux/rseq.h> -#include <linux/seqlock.h> +#include <linux/posix-timers_types.h> +#include <linux/restart_block.h> +#include <linux/rseq_types.h> +#include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> +#include <linux/uidgid_types.h> +#include <linux/tracepoint-defs.h> +#include <linux/unwind_deferred_types.h> #include <asm/kmap_size.h> +#ifndef COMPILE_OFFSETS +#include <generated/rq-offsets.h> +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; -struct backing_dev_info; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; +struct bpf_net_context; struct capture_control; struct cfs_rq; struct fs_struct; @@ -55,6 +70,7 @@ struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; +struct perf_ctx_data; struct pid_namespace; struct pipe_inode_info; struct rcu_node; @@ -63,25 +79,29 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; -struct sched_param; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; +struct user_event_mm; + +#include <linux/sched/ext.h> /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * - * We have two separate sets of flags: task->state + * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ -/* Used in tsk->state: */ +/* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 @@ -91,7 +111,7 @@ struct task_group; #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) -/* Used in tsk->state again: */ +/* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 @@ -137,8 +157,9 @@ struct task_group; * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ -#define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ + TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ @@ -171,8 +192,14 @@ struct task_group; # define debug_rtlock_wait_restore_state() do { } while (0) #endif +#define trace_set_current_state(state_value) \ + do { \ + if (tracepoint_enabled(sched_set_state_tp)) \ + __trace_set_current_state(state_value); \ + } while (0) + /* - * set_current_state() includes a barrier so that the write of current->state + * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * @@ -195,9 +222,9 @@ struct task_group; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before - * accessing p->state. + * accessing p->__state. * - * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * @@ -211,12 +238,14 @@ struct task_group; #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ + trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) @@ -232,6 +261,7 @@ struct task_group; \ raw_spin_lock_irqsave(¤t->pi_lock, flags); \ debug_special_state_change((state_value)); \ + trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ } while (0) @@ -267,6 +297,7 @@ struct task_group; raw_spin_lock(¤t->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ + trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(¤t->pi_lock); \ } while (0); @@ -276,6 +307,7 @@ struct task_group; lockdep_assert_irqs_disabled(); \ raw_spin_lock(¤t->pi_lock); \ debug_rtlock_wait_restore_state(); \ + trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(¤t->pi_lock); \ @@ -291,7 +323,7 @@ enum { TASK_COMM_LEN = 16, }; -extern void scheduler_tick(void); +extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -312,6 +344,12 @@ extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +/* wrapper functions to trace from this header file */ +DECLARE_TRACEPOINT(sched_set_state_tp); +extern void __trace_set_current_state(int state_value); +DECLARE_TRACEPOINT(sched_set_need_resched_tp); +extern void __trace_set_need_resched(struct task_struct *curr, int tif); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode @@ -364,10 +402,14 @@ enum uclamp_id { UCLAMP_CNT }; -#ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -#endif +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); + +struct sched_param { + int sched_priority; +}; struct sched_info { #ifdef CONFIG_SCHED_INFO @@ -379,6 +421,12 @@ struct sched_info { /* Time spent waiting on a runqueue: */ unsigned long long run_delay; + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; + + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? */ @@ -409,42 +457,6 @@ struct load_weight { u32 inv_weight; }; -/** - * struct util_est - Estimation utilization of FAIR tasks - * @enqueued: instantaneous estimated utilization of a task/cpu - * @ewma: the Exponential Weighted Moving Average (EWMA) - * utilization of a task - * - * Support data structure to track an Exponential Weighted Moving Average - * (EWMA) of a FAIR task's utilization. New samples are added to the moving - * average each time a task completes an activation. Sample's weight is chosen - * so that the EWMA will be relatively insensitive to transient changes to the - * task's workload. - * - * The enqueued attribute has a slightly different meaning for tasks and cpus: - * - task: the task's util_avg at last task dequeue time - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU - * Thus, the util_est.enqueued of a task represents the contribution on the - * estimated utilization of the CPU where that task is currently enqueued. - * - * Only for tasks we track a moving average of the past instantaneous - * estimated utilization. This allows to absorb sporadic drops in utilization - * of an otherwise almost periodic task. - * - * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg - * updates. When a task is dequeued, its util_est should not be updated if its - * util_avg has not been updated in the meantime. - * This information is mapped into the MSB bit of util_est.enqueued at dequeue - * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg - * for a task) it is safe to use MSB. - */ -struct util_est { - unsigned int enqueued; - unsigned int ewma; -#define UTIL_EST_WEIGHT_SHIFT 2 -#define UTIL_AVG_UNCHANGED 0x80000000 -} __attribute__((__aligned__(sizeof(u64)))); - /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). @@ -499,9 +511,20 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; - struct util_est util_est; + unsigned int util_est; } ____cacheline_aligned; +/* + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. + */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; @@ -519,7 +542,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; - u64 exec_max; + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; @@ -548,13 +571,31 @@ struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; + u64 deadline; + u64 min_vruntime; + u64 min_slice; + struct list_head group_node; - unsigned int on_rq; + unsigned char on_rq; + unsigned char sched_delayed; + unsigned char rel_deadline; + unsigned char custom_slice; + /* hole */ u64 exec_start; u64 sum_exec_runtime; - u64 vruntime; u64 prev_sum_exec_runtime; + u64 vruntime; + union { + /* + * When !@on_rq this field is vlag. + * When cfs_rq->curr == se (which implies @on_rq) + * this field is vprot. See protect_slice(). + */ + s64 vlag; + u64 vprot; + }; + u64 slice; u64 nr_migrations; @@ -569,7 +610,6 @@ struct sched_entity { unsigned long runnable_weight; #endif -#ifdef CONFIG_SMP /* * Per entity load average tracking. * @@ -577,7 +617,6 @@ struct sched_entity { * collide with read-mostly values above. */ struct sched_avg avg; -#endif }; struct sched_rt_entity { @@ -598,6 +637,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); + struct sched_dl_entity { struct rb_node rb_node; @@ -640,11 +682,36 @@ struct sched_dl_entity { * * @dl_overrun tells if the task asked to be informed about runtime * overruns. + * + * @dl_server tells if this is a server entity. + * + * @dl_server_active tells if the dlserver is active(started). + * dlserver is started on first cfs enqueue on an idle runqueue + * and is stopped when a dequeue results in 0 cfs tasks on the + * runqueue. In other words, dlserver is active only when cpu's + * runqueue has atleast one cfs task. + * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * + * @dl_defer_running tells if the deferrable server is actually + * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; + unsigned int dl_server_active : 1; + unsigned int dl_defer : 1; + unsigned int dl_defer_armed : 1; + unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -659,7 +726,16 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + */ + struct rq *rq; + dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* @@ -723,6 +799,12 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* + * Number of contexts where an event can trigger: + * task, softirq, hardirq, nmi. + */ +#define PERF_NR_CONTEXTS 4 + struct wake_q_node { struct wake_q_node *next; }; @@ -744,10 +826,8 @@ struct task_struct { #endif unsigned int __state; -#ifdef CONFIG_PREEMPT_RT /* saved state for "spinlock sleepers" */ unsigned int saved_state; -#endif /* * This begins the randomizable portion of task_struct. Only @@ -761,7 +841,10 @@ struct task_struct { unsigned int flags; unsigned int ptrace; -#ifdef CONFIG_SMP +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; +#endif + int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; @@ -777,7 +860,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; -#endif int on_rq; int prio; @@ -788,6 +870,10 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE @@ -798,8 +884,14 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif + #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. @@ -825,14 +917,13 @@ struct task_struct { #endif unsigned int policy; + unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#ifdef CONFIG_SMP unsigned short migration_disabled; -#endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU @@ -848,6 +939,8 @@ struct task_struct { u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU @@ -862,13 +955,12 @@ struct task_struct { struct sched_info sched_info; struct list_head tasks; -#ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; -#endif struct mm_struct *mm; struct mm_struct *active_mm; + struct address_space *faults_disabled_mapping; int exit_state; int exit_code; @@ -885,6 +977,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; @@ -905,14 +998,17 @@ struct task_struct { * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; +#ifdef CONFIG_RT_MUTEXES + unsigned sched_rt_mutex:1; +#endif - /* Bit to tell LSMs we're in execve(): */ + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN @@ -943,17 +1039,20 @@ struct task_struct { /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif - + unsigned in_nf_duplicate:1; +#ifdef CONFIG_PREEMPT_RT + struct netdev_xmit net_xmit; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; @@ -996,7 +1095,6 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; - struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; @@ -1065,9 +1163,12 @@ struct task_struct { /* * executable name, excluding path. * - * - normally initialized setup_new_exec() - * - access it with [gs]et_task_comm() - * - lock it with task_lock() + * - normally initialized begin_new_exec() + * - set it with set_task_comm() + * - strscpy_pad() to ensure it is always NUL-terminated and + * zero-padded + * - task_lock() to ensure the operation is atomic and the name is + * fully updated. */ char comm[TASK_COMM_LEN]; @@ -1139,9 +1240,14 @@ struct task_struct { struct rt_mutex_waiter *pi_blocked_on; #endif -#ifdef CONFIG_DEBUG_MUTEXES - /* Mutex deadlock detection: */ - struct mutex_waiter *blocked_on; + struct mutex *blocked_on; /* lock we're blocked on */ + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + /* + * Encoded lock address causing task block (lower 2 bits = type from + * <linux/hung_task.h>). Accessed via hung_task_*() helpers. + */ + unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP @@ -1184,8 +1290,6 @@ struct task_struct { /* VM state: */ struct reclaim_state *reclaim_state; - struct backing_dev_info *backing_dev_info; - struct io_context *io_context; #ifdef CONFIG_COMPACTION @@ -1214,14 +1318,16 @@ struct task_struct { /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; - int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; -#endif +#ifdef CONFIG_PREEMPT_RT + struct llist_node cg_dead_lnode; +#endif /* CONFIG_PREEMPT_RT */ +#endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; @@ -1237,9 +1343,11 @@ struct task_struct { unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS + u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; + struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; @@ -1248,6 +1356,7 @@ struct task_struct { /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; + u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING @@ -1300,23 +1409,11 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_sig; - /* - * RmW on rseq_event_mask must be performed atomically - * with respect to preemption. - */ - unsigned long rseq_event_mask; -#endif + struct rseq_data rseq; + struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; - union { - refcount_t rcu_users; - struct rcu_head rcu; - }; - /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; @@ -1378,10 +1475,11 @@ struct task_struct { int curr_ret_depth; /* Stack of return addresses for return function tracing: */ - struct ftrace_ret_stack *ret_stack; + unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; + unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced @@ -1423,20 +1521,23 @@ struct task_struct { unsigned int kcov_softirq; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; - gfp_t memcg_oom_gfp_mask; - int memcg_oom_order; +#endif +#ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; + + /* Cache for current->cgroups->memcg->objcg lookups: */ + struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP - struct request_queue *throttle_queue; + struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES @@ -1453,6 +1554,8 @@ struct task_struct { unsigned long saved_state_change; # endif #endif + struct rcu_head rcu; + refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; @@ -1478,9 +1581,13 @@ struct task_struct { /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif + /* Used by BPF for per-TASK xdp storage */ + struct bpf_net_context *bpf_net_context; -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; +#endif +#ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif @@ -1514,138 +1621,42 @@ struct task_struct { #ifdef CONFIG_RV /* - * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. - * If we find justification for more monitors, we can think - * about adding more or developing a dynamic method. So far, - * none of these are justified. + * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. + * If memory becomes a concern, we can think about a dynamic method. */ - union rv_task_monitor rv[RV_PER_TASK_MONITORS]; + union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif - /* - * New fields for task_struct should be added above here, so that - * they are included in the randomized portion of task_struct. - */ - randomized_struct_fields_end +#ifdef CONFIG_USER_EVENTS + struct user_event_mm *user_event_mm; +#endif + +#ifdef CONFIG_UNWIND_USER + struct unwind_task_info unwind_info; +#endif /* CPU-specific state of this task: */ struct thread_struct thread; /* - * WARNING: on x86, 'thread_struct' contains a variable-sized - * structure. It *MUST* be at the end of 'task_struct'. - * - * Do not put anything below here! + * New fields for task_struct should be added above here, so that + * they are included in the randomized portion of task_struct. */ -}; - -static inline struct pid *task_pid(struct task_struct *task) -{ - return task->thread_pid; -} - -/* - * the helpers to get the task's different pids as they are seen - * from various namespaces - * - * task_xid_nr() : global id, i.e. the id seen from the init namespace; - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of - * current. - * task_xid_nr_ns() : id seen from the ns specified; - * - * see also pid_nr() etc in include/linux/pid.h - */ -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); - -static inline pid_t task_pid_nr(struct task_struct *tsk) -{ - return tsk->pid; -} - -static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); -} - -static inline pid_t task_pid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); -} - - -static inline pid_t task_tgid_nr(struct task_struct *tsk) -{ - return tsk->tgid; -} - -/** - * pid_alive - check that a task structure is not stale - * @p: Task structure to be checked. - * - * Test if a process is not yet dead (at most zombie state) - * If pid_alive fails, then pointers within the task structure - * can be stale and must not be dereferenced. - * - * Return: 1 if the process is alive. 0 otherwise. - */ -static inline int pid_alive(const struct task_struct *p) -{ - return p->thread_pid != NULL; -} - -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); -} - -static inline pid_t task_pgrp_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); -} - - -static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); -} - -static inline pid_t task_session_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); -} - -static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); -} - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); -} - -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} + randomized_struct_fields_end +} __attribute__ ((aligned (64))); -static inline pid_t task_ppid_nr(const struct task_struct *tsk) +#ifdef CONFIG_SCHED_PROXY_EXEC +DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); +static inline bool sched_proxy_exec(void) { - return task_ppid_nr_ns(tsk, &init_pid_ns); + return static_branch_likely(&__sched_proxy_exec); } - -/* Obsolete, do not use: */ -static inline pid_t task_pgrp_nr(struct task_struct *tsk) +#else +static inline bool sched_proxy_exec(void) { - return task_pgrp_nr_ns(tsk, &init_pid_ns); + return false; } +#endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) @@ -1657,15 +1668,16 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); - if (tsk_state == TASK_IDLE) + if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state == TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); @@ -1680,7 +1692,7 @@ static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; - BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); + BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } @@ -1690,20 +1702,6 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } -/** - * is_global_init - check if a task structure is init. Since init - * is free to have sub-threads we need to check tgid. - * @tsk: Task structure to be checked. - * - * Check if a task structure is the first user space task the kernel created. - * - * Return: 1 if the task structure is init. 0 otherwise. - */ -static inline int is_global_init(struct task_struct *tsk) -{ - return task_tgid_nr(tsk) == 1; -} - extern struct pid *cad_pid; /* @@ -1720,15 +1718,15 @@ extern struct pid *cad_pid; #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF__HOLE__00004000 0x00004000 +#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ -#define PF__HOLE__00010000 0x00010000 +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ -#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -1738,8 +1736,9 @@ extern struct pid *cad_pid; #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF__HOLE__20000000 0x20000000 +#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. + * See memalloc_pin_save() */ +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ @@ -1773,12 +1772,8 @@ extern struct pid *cad_pid; static __always_inline bool is_percpu_thread(void) { -#ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); -#else - return true; -#endif } /* Per-process atomic flags. */ @@ -1840,41 +1835,26 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags) } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus); -#ifdef CONFIG_SMP -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +extern int task_can_attach(struct task_struct *p); +extern int dl_bw_alloc(int cpu, u64 dl_bw); +extern void dl_bw_free(int cpu, u64 dl_bw); + +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); + +/** + * set_cpus_allowed_ptr - set CPU affinity mask of a task + * @p: the task + * @new_mask: CPU affinity mask + * + * Return: zero if successful, or a negative error code + */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - if (!cpumask_test_cpu(0, new_mask)) - return -EINVAL; - return 0; -} -static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) -{ - if (src->user_cpus_ptr) - return -EINVAL; - return 0; -} -static inline void release_user_cpus_ptr(struct task_struct *p) -{ - WARN_ON(p->user_cpus_ptr); -} - -static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - return 0; -} -#endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); @@ -1899,6 +1879,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); @@ -1921,9 +1902,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { -#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; -#endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif @@ -1938,7 +1917,7 @@ extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) -#elif !defined(__HAVE_THREAD_FUNCTIONS) +#else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif @@ -1965,26 +1944,33 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); -#else -static inline void kick_process(struct task_struct *tsk) { } -#endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} - -extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); +/* + * - Why not use task_lock()? + * User space can randomly change their names anyway, so locking for readers + * doesn't make sense. For writers, locking is probably necessary, as a race + * condition could lead to long-term mixed results. + * The strscpy_pad() in __set_task_comm() can ensure that the task comm is + * always NUL-terminated and zero-padded. Therefore the race condition between + * reader and writer is not an issue. + * + * - BUILD_BUG_ON() can help prevent the buf from being truncated. + * Since the callers don't perform any return value checks, this safeguard is + * necessary. + */ #define get_task_comm(buf, tsk) ({ \ - BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ - __get_task_comm(buf, sizeof(buf), tsk); \ + BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ + strscpy_pad(buf, (tsk)->comm); \ + buf; \ }) -#ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* @@ -1994,14 +1980,8 @@ static __always_inline void scheduler_ipi(void) */ preempt_fold_need_resched(); } + extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); -#else -static inline void scheduler_ipi(void) { } -static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) -{ - return 1; -} -#endif /* * Set thread flags in other task's structures. @@ -2040,12 +2020,16 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) static inline void set_tsk_need_resched(struct task_struct *tsk) { + if (tracepoint_enabled(sched_set_need_resched_tp) && + !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) + __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, + (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) @@ -2053,6 +2037,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +static inline void set_need_resched_current(void) +{ + lockdep_assert_irqs_disabled(); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -2072,6 +2063,7 @@ static __always_inline int _cond_resched(void) } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) + extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) @@ -2079,20 +2071,23 @@ static __always_inline int _cond_resched(void) return dynamic_cond_resched(); } -#else +#else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } -#endif /* CONFIG_PREEMPT_DYNAMIC */ +#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ -#else +#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ -static inline int _cond_resched(void) { return 0; } +static inline int _cond_resched(void) +{ + return 0; +} -#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ +#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ @@ -2137,86 +2132,71 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) -static inline void cond_resched_rcu(void) +#ifndef CONFIG_PREEMPT_RT +static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); -#endif -} - -#ifdef CONFIG_PREEMPT_DYNAMIC + struct mutex *m = p->blocked_on; -extern bool preempt_model_none(void); -extern bool preempt_model_voluntary(void); -extern bool preempt_model_full(void); - -#else - -static inline bool preempt_model_none(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_NONE); -} -static inline bool preempt_model_voluntary(void) -{ - return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); + if (m) + lockdep_assert_held_once(&m->wait_lock); + return m; } -static inline bool preempt_model_full(void) + +static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { - return IS_ENABLED(CONFIG_PREEMPT); -} + struct mutex *blocked_on = READ_ONCE(p->blocked_on); -#endif + WARN_ON_ONCE(!m); + /* The task should only be setting itself as blocked */ + WARN_ON_ONCE(p != current); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * Check ensure we don't overwrite existing mutex value + * with a different mutex. Note, setting it to the same + * lock repeatedly is ok. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + WRITE_ONCE(p->blocked_on, m); +} -static inline bool preempt_model_rt(void) +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { - return IS_ENABLED(CONFIG_PREEMPT_RT); + guard(raw_spinlock_irqsave)(&m->wait_lock); + __set_task_blocked_on(p, m); } -/* - * Does the preemption model allow non-cooperative preemption? - * - * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with - * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the - * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the - * PREEMPT_NONE model. - */ -static inline bool preempt_model_preemptible(void) +static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - return preempt_model_full() || preempt_model_rt(); + if (m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + } + WRITE_ONCE(p->blocked_on, NULL); } -/* - * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPTION, - * but a general need for low latency) - */ -static inline int spin_needbreak(spinlock_t *lock) +static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { -#ifdef CONFIG_PREEMPTION - return spin_is_contended(lock); + guard(raw_spinlock_irqsave)(&m->wait_lock); + __clear_task_blocked_on(p, m); +} #else - return 0; -#endif +static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ } -/* - * Check if a rwlock is contended. - * Returns non-zero if there is another task waiting on the rwlock. - * Returns zero if the lock is not contended or the system / underlying - * rwlock implementation does not support contention detection. - * Technically does not depend on CONFIG_PREEMPTION, but a general need - * for low latency. - */ -static inline int rwlock_needbreak(rwlock_t *lock) +static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { -#ifdef CONFIG_PREEMPTION - return rwlock_is_contended(lock); -#else - return 0; -#endif } +#endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { @@ -2248,6 +2228,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ +static inline bool task_is_runnable(struct task_struct *p) +{ + return p->on_rq && !p->se.sched_delayed; +} + extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); @@ -2274,7 +2259,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* @@ -2286,138 +2270,174 @@ static inline bool owner_on_cpu(struct task_struct *owner) /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_RSEQ -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; +#ifdef CONFIG_SCHED_CORE +extern void sched_core_free(struct task_struct *tsk); +extern void sched_core_fork(struct task_struct *p); +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr); +extern int sched_core_idle_cpu(int cpu); +#else +static inline void sched_core_free(struct task_struct *tsk) { } +static inline void sched_core_fork(struct task_struct *p) { } +static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } +#endif -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; +extern void sched_set_stop_task(int cpu, struct task_struct *stop); -static inline void rseq_set_notify_resume(struct task_struct *t) +#ifdef CONFIG_MEM_ALLOC_PROFILING +static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + swap(current->alloc_tag, tag); + return tag; } -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); - -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; } +#else +#define alloc_tag_save(_tag) NULL +#define alloc_tag_restore(_tag, _old) do {} while (0) +#endif -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); +static __always_inline int task_mm_cid(struct task_struct *t) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); - rseq_handle_notify_resume(ksig, regs); + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } - -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } +static __always_inline int task_mm_cid(struct task_struct *t) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); } +#endif -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* - * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_VM set. + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_sig = 0; - t->rseq_event_mask = 0; - } else { - t->rseq = current->rseq; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; - } -} +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) -static inline void rseq_execve(struct task_struct *t) +static inline void __migrate_enable(void) { - t->rseq = NULL; - t->rseq_sig = 0; - t->rseq_event_mask = 0; -} + struct task_struct *p = current; -#else +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; } -static inline void rseq_execve(struct task_struct *t) + +static inline void __migrate_disable(void) { -} + struct task_struct *p = current; + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif + p->migration_disabled++; + return; + } -#ifdef CONFIG_DEBUG_RSEQ - -void rseq_syscall(struct pt_regs *regs); + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ -#else +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. + */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static __always_inline void migrate_disable(void) +{ + __migrate_disable(); +} -static inline void rseq_syscall(struct pt_regs *regs) +static __always_inline void migrate_enable(void) { + __migrate_enable(); } +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ -#endif +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ -#ifdef CONFIG_SCHED_CORE -extern void sched_core_free(struct task_struct *tsk); -extern void sched_core_fork(struct task_struct *p); -extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, - unsigned long uaddr); -#else -static inline void sched_core_free(struct task_struct *tsk) { } -static inline void sched_core_fork(struct task_struct *p) { } -#endif - -extern void sched_set_stop_task(int cpu, struct task_struct *stop); +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif |
