summaryrefslogtreecommitdiff
path: root/include/linux/sched.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r--include/linux/sched.h986
1 files changed, 503 insertions, 483 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 853d08f7562b..d395f2810fac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -10,41 +10,56 @@
#include <uapi/linux/sched.h>
#include <asm/current.h>
-
-#include <linux/pid.h>
-#include <linux/sem.h>
+#include <asm/processor.h>
+#include <linux/thread_info.h>
+#include <linux/preempt.h>
+#include <linux/cpumask_types.h>
+
+#include <linux/cache.h>
+#include <linux/irqflags_types.h>
+#include <linux/smp_types.h>
+#include <linux/pid_types.h>
+#include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
-#include <linux/mutex.h>
-#include <linux/plist.h>
-#include <linux/hrtimer.h>
-#include <linux/irqflags.h>
-#include <linux/seccomp.h>
-#include <linux/nodemask.h>
-#include <linux/rcupdate.h>
-#include <linux/refcount.h>
+#include <linux/mutex_types.h>
+#include <linux/plist_types.h>
+#include <linux/hrtimer_types.h>
+#include <linux/timer_types.h>
+#include <linux/seccomp_types.h>
+#include <linux/nodemask_types.h>
+#include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
-#include <linux/syscall_user_dispatch.h>
+#include <linux/spinlock.h>
+#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
+#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
-#include <linux/posix-timers.h>
-#include <linux/rseq.h>
-#include <linux/seqlock.h>
+#include <linux/posix-timers_types.h>
+#include <linux/restart_block.h>
+#include <linux/rseq_types.h>
+#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
+#include <linux/uidgid_types.h>
+#include <linux/tracepoint-defs.h>
+#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
+#ifndef COMPILE_OFFSETS
+#include <generated/rq-offsets.h>
+#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
-struct backing_dev_info;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
+struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
@@ -55,6 +70,7 @@ struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
+struct perf_ctx_data;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
@@ -63,25 +79,29 @@ struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
-struct sched_param;
+struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct task_struct;
+struct user_event_mm;
+
+#include <linux/sched/ext.h>
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
*
- * We have two separate sets of flags: task->state
+ * We have two separate sets of flags: task->__state
* is about runnability, while task->exit_state are
* about the task exiting. Confusing, but this way
* modifying one set can't modify the other one by
* mistake.
*/
-/* Used in tsk->state: */
+/* Used in tsk->__state: */
#define TASK_RUNNING 0x00000000
#define TASK_INTERRUPTIBLE 0x00000001
#define TASK_UNINTERRUPTIBLE 0x00000002
@@ -91,7 +111,7 @@ struct task_group;
#define EXIT_DEAD 0x00000010
#define EXIT_ZOMBIE 0x00000020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
-/* Used in tsk->state again: */
+/* Used in tsk->__state again: */
#define TASK_PARKED 0x00000040
#define TASK_DEAD 0x00000080
#define TASK_WAKEKILL 0x00000100
@@ -137,8 +157,9 @@ struct task_group;
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
-#define is_special_task_state(state) \
- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
+#define is_special_task_state(state) \
+ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \
+ TASK_DEAD | TASK_FROZEN))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
@@ -171,8 +192,14 @@ struct task_group;
# define debug_rtlock_wait_restore_state() do { } while (0)
#endif
+#define trace_set_current_state(state_value) \
+ do { \
+ if (tracepoint_enabled(sched_set_state_tp)) \
+ __trace_set_current_state(state_value); \
+ } while (0)
+
/*
- * set_current_state() includes a barrier so that the write of current->state
+ * set_current_state() includes a barrier so that the write of current->__state
* is correctly serialised wrt the caller's subsequent test of whether to
* actually sleep:
*
@@ -195,9 +222,9 @@ struct task_group;
* wake_up_state(p, TASK_UNINTERRUPTIBLE);
*
* where wake_up_state()/try_to_wake_up() executes a full memory barrier before
- * accessing p->state.
+ * accessing p->__state.
*
- * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
+ * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
* once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
* TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
*
@@ -211,12 +238,14 @@ struct task_group;
#define __set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
+ trace_set_current_state(state_value); \
WRITE_ONCE(current->__state, (state_value)); \
} while (0)
#define set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
+ trace_set_current_state(state_value); \
smp_store_mb(current->__state, (state_value)); \
} while (0)
@@ -232,6 +261,7 @@ struct task_group;
\
raw_spin_lock_irqsave(&current->pi_lock, flags); \
debug_special_state_change((state_value)); \
+ trace_set_current_state(state_value); \
WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
@@ -267,6 +297,7 @@ struct task_group;
raw_spin_lock(&current->pi_lock); \
current->saved_state = current->__state; \
debug_rtlock_wait_set_state(); \
+ trace_set_current_state(TASK_RTLOCK_WAIT); \
WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
raw_spin_unlock(&current->pi_lock); \
} while (0);
@@ -276,6 +307,7 @@ struct task_group;
lockdep_assert_irqs_disabled(); \
raw_spin_lock(&current->pi_lock); \
debug_rtlock_wait_restore_state(); \
+ trace_set_current_state(current->saved_state); \
WRITE_ONCE(current->__state, current->saved_state); \
current->saved_state = TASK_RUNNING; \
raw_spin_unlock(&current->pi_lock); \
@@ -291,7 +323,7 @@ enum {
TASK_COMM_LEN = 16,
};
-extern void scheduler_tick(void);
+extern void sched_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@@ -312,6 +344,12 @@ extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
+/* wrapper functions to trace from this header file */
+DECLARE_TRACEPOINT(sched_set_state_tp);
+extern void __trace_set_current_state(int state_value);
+DECLARE_TRACEPOINT(sched_set_need_resched_tp);
+extern void __trace_set_need_resched(struct task_struct *curr, int tif);
+
/**
* struct prev_cputime - snapshot of system and user cputime
* @utime: time spent in user mode
@@ -364,10 +402,14 @@ enum uclamp_id {
UCLAMP_CNT
};
-#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
-#endif
+extern void sched_domains_mutex_lock(void);
+extern void sched_domains_mutex_unlock(void);
+
+struct sched_param {
+ int sched_priority;
+};
struct sched_info {
#ifdef CONFIG_SCHED_INFO
@@ -379,6 +421,12 @@ struct sched_info {
/* Time spent waiting on a runqueue: */
unsigned long long run_delay;
+ /* Max time spent waiting on a runqueue: */
+ unsigned long long max_run_delay;
+
+ /* Min time spent waiting on a runqueue: */
+ unsigned long long min_run_delay;
+
/* Timestamps: */
/* When did we last run on a CPU? */
@@ -409,42 +457,6 @@ struct load_weight {
u32 inv_weight;
};
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma: the Exponential Weighted Moving Average (EWMA)
- * utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task: the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
- unsigned int enqueued;
- unsigned int ewma;
-#define UTIL_EST_WEIGHT_SHIFT 2
-#define UTIL_AVG_UNCHANGED 0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,9 +511,20 @@ struct sched_avg {
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
- struct util_est util_est;
+ unsigned int util_est;
} ____cacheline_aligned;
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
+
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
@@ -519,7 +542,7 @@ struct sched_statistics {
u64 block_max;
s64 sum_block_runtime;
- u64 exec_max;
+ s64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
@@ -548,13 +571,31 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+ u64 deadline;
+ u64 min_vruntime;
+ u64 min_slice;
+
struct list_head group_node;
- unsigned int on_rq;
+ unsigned char on_rq;
+ unsigned char sched_delayed;
+ unsigned char rel_deadline;
+ unsigned char custom_slice;
+ /* hole */
u64 exec_start;
u64 sum_exec_runtime;
- u64 vruntime;
u64 prev_sum_exec_runtime;
+ u64 vruntime;
+ union {
+ /*
+ * When !@on_rq this field is vlag.
+ * When cfs_rq->curr == se (which implies @on_rq)
+ * this field is vprot. See protect_slice().
+ */
+ s64 vlag;
+ u64 vprot;
+ };
+ u64 slice;
u64 nr_migrations;
@@ -569,7 +610,6 @@ struct sched_entity {
unsigned long runnable_weight;
#endif
-#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
*
@@ -577,7 +617,6 @@ struct sched_entity {
* collide with read-mostly values above.
*/
struct sched_avg avg;
-#endif
};
struct sched_rt_entity {
@@ -598,6 +637,9 @@ struct sched_rt_entity {
#endif
} __randomize_layout;
+struct rq_flags;
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);
+
struct sched_dl_entity {
struct rb_node rb_node;
@@ -640,11 +682,36 @@ struct sched_dl_entity {
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
+ *
+ * @dl_server tells if this is a server entity.
+ *
+ * @dl_server_active tells if the dlserver is active(started).
+ * dlserver is started on first cfs enqueue on an idle runqueue
+ * and is stopped when a dequeue results in 0 cfs tasks on the
+ * runqueue. In other words, dlserver is active only when cpu's
+ * runqueue has atleast one cfs task.
+ *
+ * @dl_defer tells if this is a deferred or regular server. For
+ * now only defer server exists.
+ *
+ * @dl_defer_armed tells if the deferrable server is waiting
+ * for the replenishment timer to activate it.
+ *
+ * @dl_defer_running tells if the deferrable server is actually
+ * running, skipping the defer phase.
+ *
+ * @dl_defer_idle tracks idle state
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
+ unsigned int dl_server : 1;
+ unsigned int dl_server_active : 1;
+ unsigned int dl_defer : 1;
+ unsigned int dl_defer_armed : 1;
+ unsigned int dl_defer_running : 1;
+ unsigned int dl_defer_idle : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -659,7 +726,16 @@ struct sched_dl_entity {
* timer is needed to decrease the active utilization at the correct
* time.
*/
- struct hrtimer inactive_timer;
+ struct hrtimer inactive_timer;
+
+ /*
+ * Bits for DL-server functionality. Also see the comment near
+ * dl_server_update().
+ *
+ * @rq the runqueue this server is for
+ */
+ struct rq *rq;
+ dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*
@@ -723,6 +799,12 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+/*
+ * Number of contexts where an event can trigger:
+ * task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS 4
+
struct wake_q_node {
struct wake_q_node *next;
};
@@ -744,10 +826,8 @@ struct task_struct {
#endif
unsigned int __state;
-#ifdef CONFIG_PREEMPT_RT
/* saved state for "spinlock sleepers" */
unsigned int saved_state;
-#endif
/*
* This begins the randomizable portion of task_struct. Only
@@ -761,7 +841,10 @@ struct task_struct {
unsigned int flags;
unsigned int ptrace;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+ struct alloc_tag *alloc_tag;
+#endif
+
int on_cpu;
struct __call_single_node wake_entry;
unsigned int wakee_flips;
@@ -777,7 +860,6 @@ struct task_struct {
*/
int recent_used_cpu;
int wake_cpu;
-#endif
int on_rq;
int prio;
@@ -788,6 +870,10 @@ struct task_struct {
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
+ struct sched_dl_entity *dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct sched_ext_entity scx;
+#endif
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
@@ -798,8 +884,14 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct callback_head sched_throttle_work;
+ struct list_head throttle_node;
+ bool throttled;
+#endif
#endif
+
#ifdef CONFIG_UCLAMP_TASK
/*
* Clamp values requested for a scheduling entity.
@@ -825,14 +917,13 @@ struct task_struct {
#endif
unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
cpumask_t cpus_mask;
void *migration_pending;
-#ifdef CONFIG_SMP
unsigned short migration_disabled;
-#endif
unsigned short migration_flags;
#ifdef CONFIG_PREEMPT_RCU
@@ -848,6 +939,8 @@ struct task_struct {
u8 rcu_tasks_idx;
int rcu_tasks_idle_cpu;
struct list_head rcu_tasks_holdout_list;
+ int rcu_tasks_exit_cpu;
+ struct list_head rcu_tasks_exit_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
@@ -862,13 +955,12 @@ struct task_struct {
struct sched_info sched_info;
struct list_head tasks;
-#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
-#endif
struct mm_struct *mm;
struct mm_struct *active_mm;
+ struct address_space *faults_disabled_mapping;
int exit_state;
int exit_code;
@@ -885,6 +977,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_task_hot:1;
/* Force alignment to the next boundary: */
unsigned :0;
@@ -905,14 +998,17 @@ struct task_struct {
* ->sched_remote_wakeup gets used, so it can be in this word.
*/
unsigned sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+ unsigned sched_rt_mutex:1;
+#endif
- /* Bit to tell LSMs we're in execve(): */
+ /* Bit to tell TOMOYO we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_MEMCG_V1
unsigned in_user_fault:1;
#endif
#ifdef CONFIG_LRU_GEN
@@ -943,17 +1039,20 @@ struct task_struct {
/* Recursion prevention for eventfd_signal() */
unsigned in_eventfd:1;
#endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
unsigned pasid_activated:1;
#endif
-#ifdef CONFIG_CPU_SUP_INTEL
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
unsigned reported_split_lock:1;
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
-
+ unsigned in_nf_duplicate:1;
+#ifdef CONFIG_PREEMPT_RT
+ struct netdev_xmit net_xmit;
+#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
@@ -996,7 +1095,6 @@ struct task_struct {
/* PID/PID hash table linkage. */
struct pid *thread_pid;
struct hlist_node pid_links[PIDTYPE_MAX];
- struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done;
@@ -1065,9 +1163,12 @@ struct task_struct {
/*
* executable name, excluding path.
*
- * - normally initialized setup_new_exec()
- * - access it with [gs]et_task_comm()
- * - lock it with task_lock()
+ * - normally initialized begin_new_exec()
+ * - set it with set_task_comm()
+ * - strscpy_pad() to ensure it is always NUL-terminated and
+ * zero-padded
+ * - task_lock() to ensure the operation is atomic and the name is
+ * fully updated.
*/
char comm[TASK_COMM_LEN];
@@ -1139,9 +1240,14 @@ struct task_struct {
struct rt_mutex_waiter *pi_blocked_on;
#endif
-#ifdef CONFIG_DEBUG_MUTEXES
- /* Mutex deadlock detection: */
- struct mutex_waiter *blocked_on;
+ struct mutex *blocked_on; /* lock we're blocked on */
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ /*
+ * Encoded lock address causing task block (lower 2 bits = type from
+ * <linux/hung_task.h>). Accessed via hung_task_*() helpers.
+ */
+ unsigned long blocker;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -1184,8 +1290,6 @@ struct task_struct {
/* VM state: */
struct reclaim_state *reclaim_state;
- struct backing_dev_info *backing_dev_info;
-
struct io_context *io_context;
#ifdef CONFIG_COMPACTION
@@ -1214,14 +1318,16 @@ struct task_struct {
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
- int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+ struct llist_node cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
#ifdef CONFIG_X86_CPU_RESCTRL
u32 closid;
u32 rmid;
@@ -1237,9 +1343,11 @@ struct task_struct {
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
+ u8 perf_recursion[PERF_NR_CONTEXTS];
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
+ struct perf_ctx_data __rcu *perf_ctx_data;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
@@ -1248,6 +1356,7 @@ struct task_struct {
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
+ u8 il_weight;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1300,23 +1409,11 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_sig;
- /*
- * RmW on rseq_event_mask must be performed atomically
- * with respect to preemption.
- */
- unsigned long rseq_event_mask;
-#endif
+ struct rseq_data rseq;
+ struct sched_mm_cid mm_cid;
struct tlbflush_unmap_batch tlb_ubc;
- union {
- refcount_t rcu_users;
- struct rcu_head rcu;
- };
-
/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
@@ -1378,10 +1475,11 @@ struct task_struct {
int curr_ret_depth;
/* Stack of return addresses for return function tracing: */
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
/* Timestamp for last schedule: */
unsigned long long ftrace_timestamp;
+ unsigned long long ftrace_sleeptime;
/*
* Number of functions that haven't been traced
@@ -1423,20 +1521,23 @@ struct task_struct {
unsigned int kcov_softirq;
#endif
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_MEMCG_V1
struct mem_cgroup *memcg_in_oom;
- gfp_t memcg_oom_gfp_mask;
- int memcg_oom_order;
+#endif
+#ifdef CONFIG_MEMCG
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
+
+ /* Cache for current->cgroups->memcg->objcg lookups: */
+ struct obj_cgroup *objcg;
#endif
#ifdef CONFIG_BLK_CGROUP
- struct request_queue *throttle_queue;
+ struct gendisk *throttle_disk;
#endif
#ifdef CONFIG_UPROBES
@@ -1453,6 +1554,8 @@ struct task_struct {
unsigned long saved_state_change;
# endif
#endif
+ struct rcu_head rcu;
+ refcount_t rcu_users;
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
@@ -1478,9 +1581,13 @@ struct task_struct {
/* Used for BPF run context */
struct bpf_run_ctx *bpf_ctx;
#endif
+ /* Used by BPF for per-TASK xdp storage */
+ struct bpf_net_context *bpf_net_context;
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
unsigned long lowest_stack;
+#endif
+#ifdef CONFIG_KSTACK_ERASE_METRICS
unsigned long prev_lowest_stack;
#endif
@@ -1514,138 +1621,42 @@ struct task_struct {
#ifdef CONFIG_RV
/*
- * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
- * If we find justification for more monitors, we can think
- * about adding more or developing a dynamic method. So far,
- * none of these are justified.
+ * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS.
+ * If memory becomes a concern, we can think about a dynamic method.
*/
- union rv_task_monitor rv[RV_PER_TASK_MONITORS];
+ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS];
#endif
- /*
- * New fields for task_struct should be added above here, so that
- * they are included in the randomized portion of task_struct.
- */
- randomized_struct_fields_end
+#ifdef CONFIG_USER_EVENTS
+ struct user_event_mm *user_event_mm;
+#endif
+
+#ifdef CONFIG_UNWIND_USER
+ struct unwind_task_info unwind_info;
+#endif
/* CPU-specific state of this task: */
struct thread_struct thread;
/*
- * WARNING: on x86, 'thread_struct' contains a variable-sized
- * structure. It *MUST* be at the end of 'task_struct'.
- *
- * Do not put anything below here!
+ * New fields for task_struct should be added above here, so that
+ * they are included in the randomized portion of task_struct.
*/
-};
-
-static inline struct pid *task_pid(struct task_struct *task)
-{
- return task->thread_pid;
-}
-
-/*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr() : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
- * current.
- * task_xid_nr_ns() : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
-pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
-static inline pid_t task_pid_nr(struct task_struct *tsk)
-{
- return tsk->pid;
-}
-
-static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
-}
-
-static inline pid_t task_pid_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
-}
-
-
-static inline pid_t task_tgid_nr(struct task_struct *tsk)
-{
- return tsk->tgid;
-}
-
-/**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
-static inline int pid_alive(const struct task_struct *p)
-{
- return p->thread_pid != NULL;
-}
-
-static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
-}
-
-static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
-}
-
-
-static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
-}
-
-static inline pid_t task_session_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
-}
-
-static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
-}
-
-static inline pid_t task_tgid_vnr(struct task_struct *tsk)
-{
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
-}
-
-static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
-{
- pid_t pid = 0;
-
- rcu_read_lock();
- if (pid_alive(tsk))
- pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
- rcu_read_unlock();
-
- return pid;
-}
+ randomized_struct_fields_end
+} __attribute__ ((aligned (64)));
-static inline pid_t task_ppid_nr(const struct task_struct *tsk)
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static inline bool sched_proxy_exec(void)
{
- return task_ppid_nr_ns(tsk, &init_pid_ns);
+ return static_branch_likely(&__sched_proxy_exec);
}
-
-/* Obsolete, do not use: */
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+#else
+static inline bool sched_proxy_exec(void)
{
- return task_pgrp_nr_ns(tsk, &init_pid_ns);
+ return false;
}
+#endif
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
@@ -1657,15 +1668,16 @@ static inline unsigned int __task_state_index(unsigned int tsk_state,
BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
- if (tsk_state == TASK_IDLE)
+ if ((tsk_state & TASK_IDLE) == TASK_IDLE)
state = TASK_REPORT_IDLE;
/*
* We're lying here, but rather than expose a completely new task state
* to userspace, we can make this appear as if the task has gone through
* a regular rt_mutex_lock() call.
+ * Report frozen tasks as uninterruptible.
*/
- if (tsk_state == TASK_RTLOCK_WAIT)
+ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN))
state = TASK_UNINTERRUPTIBLE;
return fls(state);
@@ -1680,7 +1692,7 @@ static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
- BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);
+ BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1));
return state_char[state];
}
@@ -1690,20 +1702,6 @@ static inline char task_state_to_char(struct task_struct *tsk)
return task_index_to_char(task_state_index(tsk));
}
-/**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
-static inline int is_global_init(struct task_struct *tsk)
-{
- return task_tgid_nr(tsk) == 1;
-}
-
extern struct pid *cad_pid;
/*
@@ -1720,15 +1718,15 @@ extern struct pid *cad_pid;
#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* Dumped core */
#define PF_SIGNALED 0x00000400 /* Killed by a signal */
-#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
-#define PF__HOLE__00004000 0x00004000
+#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
-#define PF__HOLE__00010000 0x00010000
+#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
-#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
-#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */
+#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
@@ -1738,8 +1736,9 @@ extern struct pid *cad_pid;
#define PF__HOLE__02000000 0x02000000
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
-#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
-#define PF__HOLE__20000000 0x20000000
+#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning.
+ * See memalloc_pin_save() */
+#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */
#define PF__HOLE__40000000 0x40000000
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
@@ -1773,12 +1772,8 @@ extern struct pid *cad_pid;
static __always_inline bool is_percpu_thread(void)
{
-#ifdef CONFIG_SMP
return (current->flags & PF_NO_SETAFFINITY) &&
(current->nr_cpus_allowed == 1);
-#else
- return true;
-#endif
}
/* Per-process atomic flags. */
@@ -1840,41 +1835,26 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags)
}
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus);
-#ifdef CONFIG_SMP
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+extern int task_can_attach(struct task_struct *p);
+extern int dl_bw_alloc(int cpu, u64 dl_bw);
+extern void dl_bw_free(int cpu, u64 dl_bw);
+
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
+
+/**
+ * set_cpus_allowed_ptr - set CPU affinity mask of a task
+ * @p: the task
+ * @new_mask: CPU affinity mask
+ *
+ * Return: zero if successful, or a negative error code
+ */
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
-#else
-static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-}
-static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (!cpumask_test_cpu(0, new_mask))
- return -EINVAL;
- return 0;
-}
-static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
-{
- if (src->user_cpus_ptr)
- return -EINVAL;
- return 0;
-}
-static inline void release_user_cpus_ptr(struct task_struct *p)
-{
- WARN_ON(p->user_cpus_ptr);
-}
-
-static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
-{
- return 0;
-}
-#endif
extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
@@ -1899,6 +1879,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_fifo_secondary(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
@@ -1921,9 +1902,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p);
void yield(void);
union thread_union {
-#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
-#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
@@ -1938,7 +1917,7 @@ extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
#ifdef CONFIG_THREAD_INFO_IN_TASK
# define task_thread_info(task) (&(task)->thread_info)
-#elif !defined(__HAVE_THREAD_FUNCTIONS)
+#else
# define task_thread_info(task) ((struct thread_info *)(task)->stack)
#endif
@@ -1965,26 +1944,33 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);
-#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
-#else
-static inline void kick_process(struct task_struct *tsk) { }
-#endif
extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
+#define set_task_comm(tsk, from) ({ \
+ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \
+ __set_task_comm(tsk, from, false); \
+})
-static inline void set_task_comm(struct task_struct *tsk, const char *from)
-{
- __set_task_comm(tsk, from, false);
-}
-
-extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
+/*
+ * - Why not use task_lock()?
+ * User space can randomly change their names anyway, so locking for readers
+ * doesn't make sense. For writers, locking is probably necessary, as a race
+ * condition could lead to long-term mixed results.
+ * The strscpy_pad() in __set_task_comm() can ensure that the task comm is
+ * always NUL-terminated and zero-padded. Therefore the race condition between
+ * reader and writer is not an issue.
+ *
+ * - BUILD_BUG_ON() can help prevent the buf from being truncated.
+ * Since the callers don't perform any return value checks, this safeguard is
+ * necessary.
+ */
#define get_task_comm(buf, tsk) ({ \
- BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \
- __get_task_comm(buf, sizeof(buf), tsk); \
+ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \
+ strscpy_pad(buf, (tsk)->comm); \
+ buf; \
})
-#ifdef CONFIG_SMP
static __always_inline void scheduler_ipi(void)
{
/*
@@ -1994,14 +1980,8 @@ static __always_inline void scheduler_ipi(void)
*/
preempt_fold_need_resched();
}
+
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
-#else
-static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
- return 1;
-}
-#endif
/*
* Set thread flags in other task's structures.
@@ -2040,12 +2020,16 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
+ if (tracepoint_enabled(sched_set_need_resched_tp) &&
+ !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED))
+ __trace_set_need_resched(tsk, TIF_NEED_RESCHED);
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
- clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+ (atomic_long_t *)&task_thread_info(tsk)->flags);
}
static inline int test_tsk_need_resched(struct task_struct *tsk)
@@ -2053,6 +2037,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
+static inline void set_need_resched_current(void)
+{
+ lockdep_assert_irqs_disabled();
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -2072,6 +2063,7 @@ static __always_inline int _cond_resched(void)
}
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
extern int dynamic_cond_resched(void);
static __always_inline int _cond_resched(void)
@@ -2079,20 +2071,23 @@ static __always_inline int _cond_resched(void)
return dynamic_cond_resched();
}
-#else
+#else /* !CONFIG_PREEMPTION */
static inline int _cond_resched(void)
{
return __cond_resched();
}
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+ return 0;
+}
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
#define cond_resched() ({ \
__might_resched(__FILE__, __LINE__, 0); \
@@ -2137,86 +2132,71 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
__cond_resched_rwlock_write(lock); \
})
-static inline void cond_resched_rcu(void)
+#ifndef CONFIG_PREEMPT_RT
+static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
{
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
-#endif
-}
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
+ struct mutex *m = p->blocked_on;
-extern bool preempt_model_none(void);
-extern bool preempt_model_voluntary(void);
-extern bool preempt_model_full(void);
-
-#else
-
-static inline bool preempt_model_none(void)
-{
- return IS_ENABLED(CONFIG_PREEMPT_NONE);
-}
-static inline bool preempt_model_voluntary(void)
-{
- return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
+ if (m)
+ lockdep_assert_held_once(&m->wait_lock);
+ return m;
}
-static inline bool preempt_model_full(void)
+
+static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- return IS_ENABLED(CONFIG_PREEMPT);
-}
+ struct mutex *blocked_on = READ_ONCE(p->blocked_on);
-#endif
+ WARN_ON_ONCE(!m);
+ /* The task should only be setting itself as blocked */
+ WARN_ON_ONCE(p != current);
+ /* Currently we serialize blocked_on under the mutex::wait_lock */
+ lockdep_assert_held_once(&m->wait_lock);
+ /*
+ * Check ensure we don't overwrite existing mutex value
+ * with a different mutex. Note, setting it to the same
+ * lock repeatedly is ok.
+ */
+ WARN_ON_ONCE(blocked_on && blocked_on != m);
+ WRITE_ONCE(p->blocked_on, m);
+}
-static inline bool preempt_model_rt(void)
+static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- return IS_ENABLED(CONFIG_PREEMPT_RT);
+ guard(raw_spinlock_irqsave)(&m->wait_lock);
+ __set_task_blocked_on(p, m);
}
-/*
- * Does the preemption model allow non-cooperative preemption?
- *
- * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
- * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
- * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
- * PREEMPT_NONE model.
- */
-static inline bool preempt_model_preemptible(void)
+static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- return preempt_model_full() || preempt_model_rt();
+ if (m) {
+ struct mutex *blocked_on = READ_ONCE(p->blocked_on);
+
+ /* Currently we serialize blocked_on under the mutex::wait_lock */
+ lockdep_assert_held_once(&m->wait_lock);
+ /*
+ * There may be cases where we re-clear already cleared
+ * blocked_on relationships, but make sure we are not
+ * clearing the relationship with a different lock.
+ */
+ WARN_ON_ONCE(blocked_on && blocked_on != m);
+ }
+ WRITE_ONCE(p->blocked_on, NULL);
}
-/*
- * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
- * but a general need for low latency)
- */
-static inline int spin_needbreak(spinlock_t *lock)
+static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
-#ifdef CONFIG_PREEMPTION
- return spin_is_contended(lock);
+ guard(raw_spinlock_irqsave)(&m->wait_lock);
+ __clear_task_blocked_on(p, m);
+}
#else
- return 0;
-#endif
+static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
}
-/*
- * Check if a rwlock is contended.
- * Returns non-zero if there is another task waiting on the rwlock.
- * Returns zero if the lock is not contended or the system / underlying
- * rwlock implementation does not support contention detection.
- * Technically does not depend on CONFIG_PREEMPTION, but a general need
- * for low latency.
- */
-static inline int rwlock_needbreak(rwlock_t *lock)
+static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
{
-#ifdef CONFIG_PREEMPTION
- return rwlock_is_contended(lock);
-#else
- return 0;
-#endif
}
+#endif /* !CONFIG_PREEMPT_RT */
static __always_inline bool need_resched(void)
{
@@ -2248,6 +2228,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
#endif /* CONFIG_SMP */
+static inline bool task_is_runnable(struct task_struct *p)
+{
+ return p->on_rq && !p->se.sched_delayed;
+}
+
extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
@@ -2274,7 +2259,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
-#ifdef CONFIG_SMP
static inline bool owner_on_cpu(struct task_struct *owner)
{
/*
@@ -2286,138 +2270,174 @@ static inline bool owner_on_cpu(struct task_struct *owner)
/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu);
-#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_RSEQ
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
+#ifdef CONFIG_SCHED_CORE
+extern void sched_core_free(struct task_struct *tsk);
+extern void sched_core_fork(struct task_struct *p);
+extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr);
+extern int sched_core_idle_cpu(int cpu);
+#else
+static inline void sched_core_free(struct task_struct *tsk) { }
+static inline void sched_core_fork(struct task_struct *p) { }
+static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
+#endif
-enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-static inline void rseq_set_notify_resume(struct task_struct *t)
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
{
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ swap(current->alloc_tag, tag);
+ return tag;
}
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
{
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+ WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
+#endif
+ current->alloc_tag = old;
}
+#else
+#define alloc_tag_save(_tag) NULL
+#define alloc_tag_restore(_tag, _old) do {} while (0)
+#endif
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+/* Avoids recursive inclusion hell */
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit(struct task_struct *t);
+static __always_inline int task_mm_cid(struct task_struct *t)
{
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- preempt_enable();
- rseq_handle_notify_resume(ksig, regs);
+ return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);
}
-
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit(struct task_struct *t) { }
+static __always_inline int task_mm_cid(struct task_struct *t)
{
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, althrough it won't provide the memory usage benefits.
+ */
+ return task_cpu(t);
}
+#endif
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
-}
+#ifndef MODULE
+#ifndef COMPILE_OFFSETS
+
+extern void ___migrate_enable(void);
+
+struct rq;
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ * The "struct rq" is not available here, so we can't access the
+ * "runqueues" with this_cpu_ptr(), as the compilation will fail in
+ * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
+ * typeof((ptr) + 0)
+ *
+ * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
*/
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
- } else {
- t->rseq = current->rseq;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
- }
-}
+#ifdef CONFIG_SMP
+#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
+#else
+#define this_rq_raw() PERCPU_PTR(&runqueues)
+#endif
+#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))
-static inline void rseq_execve(struct task_struct *t)
+static inline void __migrate_enable(void)
{
- t->rseq = NULL;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
-}
+ struct task_struct *p = current;
-#else
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
-{
-}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
-{
-}
-static inline void rseq_preempt(struct task_struct *t)
-{
-}
-static inline void rseq_migrate(struct task_struct *t)
-{
-}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
-{
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ if (unlikely(p->cpus_ptr != &p->cpus_mask))
+ ___migrate_enable();
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq_pinned()--;
}
-static inline void rseq_execve(struct task_struct *t)
+
+static inline void __migrate_disable(void)
{
-}
+ struct task_struct *p = current;
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ *Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
#endif
+ p->migration_disabled++;
+ return;
+ }
-#ifdef CONFIG_DEBUG_RSEQ
-
-void rseq_syscall(struct pt_regs *regs);
+ guard(preempt)();
+ this_rq_pinned()++;
+ p->migration_disabled = 1;
+}
+#else /* !COMPILE_OFFSETS */
+static inline void __migrate_disable(void) { }
+static inline void __migrate_enable(void) { }
+#endif /* !COMPILE_OFFSETS */
-#else
+/*
+ * So that it is possible to not export the runqueues variable, define and
+ * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
+ * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will
+ * be defined in kernel/sched/core.c.
+ */
+#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+static __always_inline void migrate_disable(void)
+{
+ __migrate_disable();
+}
-static inline void rseq_syscall(struct pt_regs *regs)
+static __always_inline void migrate_enable(void)
{
+ __migrate_enable();
}
+#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
-#endif
+#else /* MODULE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* MODULE */
-#ifdef CONFIG_SCHED_CORE
-extern void sched_core_free(struct task_struct *tsk);
-extern void sched_core_fork(struct task_struct *p);
-extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
- unsigned long uaddr);
-#else
-static inline void sched_core_free(struct task_struct *tsk) { }
-static inline void sched_core_fork(struct task_struct *p) { }
-#endif
-
-extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#endif