summary | refs | log | tree | commit | diff
path: root/include/linux/sched.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r-- include/linux/sched.h 550
1 files changed, 387 insertions, 163 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d07d3645d2d5..d395f2810fac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -13,7 +13,7 @@
#include <asm/processor.h>
#include <linux/thread_info.h>
#include <linux/preempt.h>
-#include <linux/cpumask.h>
+#include <linux/cpumask_types.h>
#include <linux/cache.h>
#include <linux/irqflags_types.h>
@@ -34,19 +34,24 @@
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
+#include <linux/spinlock.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
-#include <uapi/linux/rseq.h>
+#include <linux/rseq_types.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
-#include <linux/livepatch_sched.h>
#include <linux/uidgid_types.h>
+#include <linux/tracepoint-defs.h>
+#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
+#ifndef COMPILE_OFFSETS
+#include <generated/rq-offsets.h>
+#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -65,6 +70,7 @@ struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
+struct perf_ctx_data;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
@@ -82,6 +88,8 @@ struct task_group;
struct task_struct;
struct user_event_mm;
+#include <linux/sched/ext.h>
+
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
@@ -149,8 +157,9 @@ struct user_event_mm;
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
-#define is_special_task_state(state) \
- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
+#define is_special_task_state(state) \
+ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \
+ TASK_DEAD | TASK_FROZEN))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
@@ -183,6 +192,12 @@ struct user_event_mm;
# define debug_rtlock_wait_restore_state() do { } while (0)
#endif
+#define trace_set_current_state(state_value) \
+ do { \
+ if (tracepoint_enabled(sched_set_state_tp)) \
+ __trace_set_current_state(state_value); \
+ } while (0)
+
/*
* set_current_state() includes a barrier so that the write of current->__state
* is correctly serialised wrt the caller's subsequent test of whether to
@@ -223,12 +238,14 @@ struct user_event_mm;
#define __set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
+ trace_set_current_state(state_value); \
WRITE_ONCE(current->__state, (state_value)); \
} while (0)
#define set_current_state(state_value) \
do { \
debug_normal_state_change((state_value)); \
+ trace_set_current_state(state_value); \
smp_store_mb(current->__state, (state_value)); \
} while (0)
@@ -244,6 +261,7 @@ struct user_event_mm;
\
raw_spin_lock_irqsave(&current->pi_lock, flags); \
debug_special_state_change((state_value)); \
+ trace_set_current_state(state_value); \
WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
@@ -279,6 +297,7 @@ struct user_event_mm;
raw_spin_lock(&current->pi_lock); \
current->saved_state = current->__state; \
debug_rtlock_wait_set_state(); \
+ trace_set_current_state(TASK_RTLOCK_WAIT); \
WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
raw_spin_unlock(&current->pi_lock); \
} while (0);
@@ -288,6 +307,7 @@ struct user_event_mm;
lockdep_assert_irqs_disabled(); \
raw_spin_lock(&current->pi_lock); \
debug_rtlock_wait_restore_state(); \
+ trace_set_current_state(current->saved_state); \
WRITE_ONCE(current->__state, current->saved_state); \
current->saved_state = TASK_RUNNING; \
raw_spin_unlock(&current->pi_lock); \
@@ -324,6 +344,12 @@ extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);
+/* wrapper functions to trace from this header file */
+DECLARE_TRACEPOINT(sched_set_state_tp);
+extern void __trace_set_current_state(int state_value);
+DECLARE_TRACEPOINT(sched_set_need_resched_tp);
+extern void __trace_set_need_resched(struct task_struct *curr, int tif);
+
/**
* struct prev_cputime - snapshot of system and user cputime
* @utime: time spent in user mode
@@ -376,10 +402,10 @@ enum uclamp_id {
UCLAMP_CNT
};
-#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
-#endif
+extern void sched_domains_mutex_lock(void);
+extern void sched_domains_mutex_unlock(void);
struct sched_param {
int sched_priority;
@@ -395,6 +421,12 @@ struct sched_info {
/* Time spent waiting on a runqueue: */
unsigned long long run_delay;
+ /* Max time spent waiting on a runqueue: */
+ unsigned long long max_run_delay;
+
+ /* Min time spent waiting on a runqueue: */
+ unsigned long long min_run_delay;
+
/* Timestamps: */
/* When did we last run on a CPU? */
@@ -541,15 +573,28 @@ struct sched_entity {
struct rb_node run_node;
u64 deadline;
u64 min_vruntime;
+ u64 min_slice;
struct list_head group_node;
- unsigned int on_rq;
+ unsigned char on_rq;
+ unsigned char sched_delayed;
+ unsigned char rel_deadline;
+ unsigned char custom_slice;
+ /* hole */
u64 exec_start;
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
- s64 vlag;
+ union {
+ /*
+ * When !@on_rq this field is vlag.
+ * When cfs_rq->curr == se (which implies @on_rq)
+ * this field is vprot. See protect_slice().
+ */
+ s64 vlag;
+ u64 vprot;
+ };
u64 slice;
u64 nr_migrations;
@@ -565,7 +610,6 @@ struct sched_entity {
unsigned long runnable_weight;
#endif
-#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
*
@@ -573,7 +617,6 @@ struct sched_entity {
* collide with read-mostly values above.
*/
struct sched_avg avg;
-#endif
};
struct sched_rt_entity {
@@ -594,8 +637,8 @@ struct sched_rt_entity {
#endif
} __randomize_layout;
-typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
-typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+struct rq_flags;
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf);
struct sched_dl_entity {
struct rb_node rb_node;
@@ -639,12 +682,36 @@ struct sched_dl_entity {
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
+ *
+ * @dl_server tells if this is a server entity.
+ *
+ * @dl_server_active tells if the dlserver is active (started).
+ * The dlserver is started on the first cfs enqueue on an idle
+ * runqueue and is stopped when a dequeue results in 0 cfs tasks
+ * on the runqueue. In other words, the dlserver is active only
+ * when the CPU's runqueue has at least one cfs task.
+ *
+ * @dl_defer tells if this is a deferred or regular server. For
+ * now only defer server exists.
+ *
+ * @dl_defer_armed tells if the deferrable server is waiting
+ * for the replenishment timer to activate it.
+ *
+ * @dl_defer_running tells if the deferrable server is actually
+ * running, skipping the defer phase.
+ *
+ * @dl_defer_idle tracks idle state
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
unsigned int dl_server : 1;
+ unsigned int dl_server_active : 1;
+ unsigned int dl_defer : 1;
+ unsigned int dl_defer_armed : 1;
+ unsigned int dl_defer_running : 1;
+ unsigned int dl_defer_idle : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -666,13 +733,9 @@ struct sched_dl_entity {
* dl_server_update().
*
* @rq the runqueue this server is for
- *
- * @server_has_tasks() returns true if @server_pick return a
- * runnable task.
*/
struct rq *rq;
- dl_server_has_tasks_f server_has_tasks;
- dl_server_pick_f server_pick;
+ dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*
@@ -736,6 +799,12 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+/*
+ * Number of contexts where an event can trigger:
+ * task, softirq, hardirq, nmi.
+ */
+#define PERF_NR_CONTEXTS 4
+
struct wake_q_node {
struct wake_q_node *next;
};
@@ -776,7 +845,6 @@ struct task_struct {
struct alloc_tag *alloc_tag;
#endif
-#ifdef CONFIG_SMP
int on_cpu;
struct __call_single_node wake_entry;
unsigned int wakee_flips;
@@ -792,7 +860,6 @@ struct task_struct {
*/
int recent_used_cpu;
int wake_cpu;
-#endif
int on_rq;
int prio;
@@ -804,6 +871,9 @@ struct task_struct {
struct sched_rt_entity rt;
struct sched_dl_entity dl;
struct sched_dl_entity *dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct sched_ext_entity scx;
+#endif
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
@@ -814,6 +884,11 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct callback_head sched_throttle_work;
+ struct list_head throttle_node;
+ bool throttled;
+#endif
#endif
@@ -848,9 +923,7 @@ struct task_struct {
cpumask_t *user_cpus_ptr;
cpumask_t cpus_mask;
void *migration_pending;
-#ifdef CONFIG_SMP
unsigned short migration_disabled;
-#endif
unsigned short migration_flags;
#ifdef CONFIG_PREEMPT_RCU
@@ -882,10 +955,8 @@ struct task_struct {
struct sched_info sched_info;
struct list_head tasks;
-#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
-#endif
struct mm_struct *mm;
struct mm_struct *active_mm;
@@ -906,6 +977,7 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned sched_task_hot:1;
/* Force alignment to the next boundary: */
unsigned :0;
@@ -936,7 +1008,7 @@ struct task_struct {
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_MEMCG_V1
unsigned in_user_fault:1;
#endif
#ifdef CONFIG_LRU_GEN
@@ -970,13 +1042,14 @@ struct task_struct {
#ifdef CONFIG_ARCH_HAS_CPU_PASID
unsigned pasid_activated:1;
#endif
-#ifdef CONFIG_CPU_SUP_INTEL
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
unsigned reported_split_lock:1;
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
+ unsigned in_nf_duplicate:1;
#ifdef CONFIG_PREEMPT_RT
struct netdev_xmit net_xmit;
#endif
@@ -1090,9 +1163,12 @@ struct task_struct {
/*
* executable name, excluding path.
*
- * - normally initialized setup_new_exec()
- * - access it with [gs]et_task_comm()
- * - lock it with task_lock()
+ * - normally initialized by begin_new_exec()
+ * - set it with set_task_comm()
+ * - strscpy_pad() to ensure it is always NUL-terminated and
+ * zero-padded
+ * - task_lock() to ensure the operation is atomic and the name is
+ * fully updated.
*/
char comm[TASK_COMM_LEN];
@@ -1164,9 +1240,14 @@ struct task_struct {
struct rt_mutex_waiter *pi_blocked_on;
#endif
-#ifdef CONFIG_DEBUG_MUTEXES
- /* Mutex deadlock detection: */
- struct mutex_waiter *blocked_on;
+ struct mutex *blocked_on; /* lock we're blocked on */
+
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+ /*
+ * Encoded lock address causing task block (lower 2 bits = type from
+ * <linux/hung_task.h>). Accessed via hung_task_*() helpers.
+ */
+ unsigned long blocker;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -1237,14 +1318,16 @@ struct task_struct {
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
- int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+ struct llist_node cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
#ifdef CONFIG_X86_CPU_RESCTRL
u32 closid;
u32 rmid;
@@ -1260,9 +1343,11 @@ struct task_struct {
unsigned int futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
+ u8 perf_recursion[PERF_NR_CONTEXTS];
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
+ struct perf_ctx_data __rcu *perf_ctx_data;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
@@ -1324,24 +1409,8 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
- /*
- * RmW on rseq_event_mask must be performed atomically
- * with respect to preemption.
- */
- unsigned long rseq_event_mask;
-#endif
-
-#ifdef CONFIG_SCHED_MM_CID
- int mm_cid; /* Current cid in mm */
- int last_mm_cid; /* Most recent cid in mm */
- int migrate_from_cpu;
- int mm_cid_active; /* Whether cid bitmap is active */
- struct callback_head cid_work;
-#endif
+ struct rseq_data rseq;
+ struct sched_mm_cid mm_cid;
struct tlbflush_unmap_batch tlb_ubc;
@@ -1406,10 +1475,11 @@ struct task_struct {
int curr_ret_depth;
/* Stack of return addresses for return function tracing: */
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
/* Timestamp for last schedule: */
unsigned long long ftrace_timestamp;
+ unsigned long long ftrace_sleeptime;
/*
* Number of functions that haven't been traced
@@ -1451,17 +1521,18 @@ struct task_struct {
unsigned int kcov_softirq;
#endif
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_MEMCG_V1
struct mem_cgroup *memcg_in_oom;
+#endif
+#ifdef CONFIG_MEMCG
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
-#endif
-#ifdef CONFIG_MEMCG_KMEM
+ /* Cache for current->cgroups->memcg->objcg lookups: */
struct obj_cgroup *objcg;
#endif
@@ -1513,8 +1584,10 @@ struct task_struct {
/* Used by BPF for per-TASK xdp storage */
struct bpf_net_context *bpf_net_context;
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
unsigned long lowest_stack;
+#endif
+#ifdef CONFIG_KSTACK_ERASE_METRICS
unsigned long prev_lowest_stack;
#endif
@@ -1548,34 +1621,42 @@ struct task_struct {
#ifdef CONFIG_RV
/*
- * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
- * If we find justification for more monitors, we can think
- * about adding more or developing a dynamic method. So far,
- * none of these are justified.
+ * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS.
+ * If memory becomes a concern, we can think about a dynamic method.
*/
- union rv_task_monitor rv[RV_PER_TASK_MONITORS];
+ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS];
#endif
#ifdef CONFIG_USER_EVENTS
struct user_event_mm *user_event_mm;
#endif
- /*
- * New fields for task_struct should be added above here, so that
- * they are included in the randomized portion of task_struct.
- */
- randomized_struct_fields_end
+#ifdef CONFIG_UNWIND_USER
+ struct unwind_task_info unwind_info;
+#endif
/* CPU-specific state of this task: */
struct thread_struct thread;
/*
- * WARNING: on x86, 'thread_struct' contains a variable-sized
- * structure. It *MUST* be at the end of 'task_struct'.
- *
- * Do not put anything below here!
+ * New fields for task_struct should be added above here, so that
+ * they are included in the randomized portion of task_struct.
*/
-};
+ randomized_struct_fields_end
+} __attribute__ ((aligned (64)));
+
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static inline bool sched_proxy_exec(void)
+{
+ return static_branch_likely(&__sched_proxy_exec);
+}
+#else
+static inline bool sched_proxy_exec(void)
+{
+ return false;
+}
+#endif
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
@@ -1594,8 +1675,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state,
* We're lying here, but rather than expose a completely new task state
* to userspace, we can make this appear as if the task has gone through
* a regular rt_mutex_lock() call.
+ * Report frozen tasks as uninterruptible.
*/
- if (tsk_state & TASK_RTLOCK_WAIT)
+ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN))
state = TASK_UNINTERRUPTIBLE;
return fls(state);
@@ -1610,7 +1692,7 @@ static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
- BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);
+ BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1));
return state_char[state];
}
@@ -1641,7 +1723,7 @@ extern struct pid *cad_pid;
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
-#define PF__HOLE__00010000 0x00010000
+#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
@@ -1649,8 +1731,8 @@ extern struct pid *cad_pid;
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
-#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
-#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */
+#define PF__HOLE__00800000 0x00800000
+#define PF__HOLE__01000000 0x01000000
#define PF__HOLE__02000000 0x02000000
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
@@ -1690,12 +1772,8 @@ extern struct pid *cad_pid;
static __always_inline bool is_percpu_thread(void)
{
-#ifdef CONFIG_SMP
return (current->flags & PF_NO_SETAFFINITY) &&
(current->nr_cpus_allowed == 1);
-#else
- return true;
-#endif
}
/* Per-process atomic flags. */
@@ -1760,10 +1838,9 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
-#ifdef CONFIG_SMP
-/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
-extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
+/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */
+extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask);
/**
* set_cpus_allowed_ptr - set CPU affinity mask of a task
@@ -1778,32 +1855,6 @@ extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
-#else
-static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-}
-static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (!cpumask_test_cpu(0, new_mask))
- return -EINVAL;
- return 0;
-}
-static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
-{
- if (src->user_cpus_ptr)
- return -EINVAL;
- return 0;
-}
-static inline void release_user_cpus_ptr(struct task_struct *p)
-{
- WARN_ON(p->user_cpus_ptr);
-}
-
-static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
-{
- return 0;
-}
-#endif
extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
@@ -1828,6 +1879,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_fifo_secondary(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
@@ -1865,7 +1917,7 @@ extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
#ifdef CONFIG_THREAD_INFO_IN_TASK
# define task_thread_info(task) (&(task)->thread_info)
-#elif !defined(__HAVE_THREAD_FUNCTIONS)
+#else
# define task_thread_info(task) ((struct thread_info *)(task)->stack)
#endif
@@ -1892,26 +1944,33 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);
-#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
-#else
-static inline void kick_process(struct task_struct *tsk) { }
-#endif
extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
+#define set_task_comm(tsk, from) ({ \
+ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \
+ __set_task_comm(tsk, from, false); \
+})
-static inline void set_task_comm(struct task_struct *tsk, const char *from)
-{
- __set_task_comm(tsk, from, false);
-}
-
-extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
+/*
+ * - Why not use task_lock()?
+ * User space can randomly change their names anyway, so locking for readers
+ * doesn't make sense. For writers, locking is probably necessary, as a race
+ * condition could lead to long-term mixed results.
+ * The strscpy_pad() in __set_task_comm() can ensure that the task comm is
+ * always NUL-terminated and zero-padded. Therefore the race condition between
+ * reader and writer is not an issue.
+ *
+ * - BUILD_BUG_ON() can help prevent the buf from being truncated.
+ * Since the callers don't perform any return value checks, this safeguard is
+ * necessary.
+ */
#define get_task_comm(buf, tsk) ({ \
- BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \
- __get_task_comm(buf, sizeof(buf), tsk); \
+ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \
+ strscpy_pad(buf, (tsk)->comm); \
+ buf; \
})
-#ifdef CONFIG_SMP
static __always_inline void scheduler_ipi(void)
{
/*
@@ -1921,9 +1980,6 @@ static __always_inline void scheduler_ipi(void)
*/
preempt_fold_need_resched();
}
-#else
-static inline void scheduler_ipi(void) { }
-#endif
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
@@ -1964,12 +2020,16 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
+ if (tracepoint_enabled(sched_set_need_resched_tp) &&
+ !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED))
+ __trace_set_need_resched(tsk, TIF_NEED_RESCHED);
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
- clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+ (atomic_long_t *)&task_thread_info(tsk)->flags);
}
static inline int test_tsk_need_resched(struct task_struct *tsk)
@@ -1977,6 +2037,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
+static inline void set_need_resched_current(void)
+{
+ lockdep_assert_irqs_disabled();
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -1988,9 +2055,6 @@ extern int __cond_resched(void);
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-void sched_dynamic_klp_enable(void);
-void sched_dynamic_klp_disable(void);
-
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
@@ -2011,7 +2075,6 @@ static __always_inline int _cond_resched(void)
static inline int _cond_resched(void)
{
- klp_sched_try_switch();
return __cond_resched();
}
@@ -2021,7 +2084,6 @@ static inline int _cond_resched(void)
static inline int _cond_resched(void)
{
- klp_sched_try_switch();
return 0;
}
@@ -2070,46 +2132,71 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
__cond_resched_rwlock_write(lock); \
})
-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-extern bool preempt_model_none(void);
-extern bool preempt_model_voluntary(void);
-extern bool preempt_model_full(void);
+#ifndef CONFIG_PREEMPT_RT
+static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
+{
+ struct mutex *m = p->blocked_on;
-#else
+ if (m)
+ lockdep_assert_held_once(&m->wait_lock);
+ return m;
+}
-static inline bool preempt_model_none(void)
+static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- return IS_ENABLED(CONFIG_PREEMPT_NONE);
+ struct mutex *blocked_on = READ_ONCE(p->blocked_on);
+
+ WARN_ON_ONCE(!m);
+ /* The task should only be setting itself as blocked */
+ WARN_ON_ONCE(p != current);
+ /* Currently we serialize blocked_on under the mutex::wait_lock */
+ lockdep_assert_held_once(&m->wait_lock);
+ /*
+ * Check to ensure we don't overwrite an existing mutex value
+ * with a different mutex. Note, setting it to the same
+ * lock repeatedly is ok.
+ */
+ WARN_ON_ONCE(blocked_on && blocked_on != m);
+ WRITE_ONCE(p->blocked_on, m);
}
-static inline bool preempt_model_voluntary(void)
+
+static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
+ guard(raw_spinlock_irqsave)(&m->wait_lock);
+ __set_task_blocked_on(p, m);
}
-static inline bool preempt_model_full(void)
+
+static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
{
- return IS_ENABLED(CONFIG_PREEMPT);
+ if (m) {
+ struct mutex *blocked_on = READ_ONCE(p->blocked_on);
+
+ /* Currently we serialize blocked_on under the mutex::wait_lock */
+ lockdep_assert_held_once(&m->wait_lock);
+ /*
+ * There may be cases where we re-clear already cleared
+ * blocked_on relationships, but make sure we are not
+ * clearing the relationship with a different lock.
+ */
+ WARN_ON_ONCE(blocked_on && blocked_on != m);
+ }
+ WRITE_ONCE(p->blocked_on, NULL);
}
-#endif
-
-static inline bool preempt_model_rt(void)
+static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+ guard(raw_spinlock_irqsave)(&m->wait_lock);
+ __clear_task_blocked_on(p, m);
+}
+#else
+static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
{
- return IS_ENABLED(CONFIG_PREEMPT_RT);
}
-/*
- * Does the preemption model allow non-cooperative preemption?
- *
- * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
- * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
- * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
- * PREEMPT_NONE model.
- */
-static inline bool preempt_model_preemptible(void)
+static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
{
- return preempt_model_full() || preempt_model_rt();
}
+#endif /* !CONFIG_PREEMPT_RT */
static __always_inline bool need_resched(void)
{
@@ -2141,12 +2228,15 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
#endif /* CONFIG_SMP */
+static inline bool task_is_runnable(struct task_struct *p)
+{
+ return p->on_rq && !p->se.sched_delayed;
+}
+
extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
-#include <linux/spinlock.h>
-
/*
* In order to reduce various lock holder preemption latencies provide an
* interface to see if a vCPU is currently running or not.
@@ -2169,7 +2259,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
-#ifdef CONFIG_SMP
static inline bool owner_on_cpu(struct task_struct *owner)
{
/*
@@ -2181,7 +2270,6 @@ static inline bool owner_on_cpu(struct task_struct *owner)
/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu);
-#endif /* CONFIG_SMP */
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
@@ -2216,4 +2304,140 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+/* Avoids recursive inclusion hell */
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit(struct task_struct *t);
+static __always_inline int task_mm_cid(struct task_struct *t)
+{
+ return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit(struct task_struct *t) { }
+static __always_inline int task_mm_cid(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, although it won't provide the memory usage benefits.
+ */
+ return task_cpu(t);
+}
+#endif
+
+#ifndef MODULE
+#ifndef COMPILE_OFFSETS
+
+extern void ___migrate_enable(void);
+
+struct rq;
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+/*
+ * The "struct rq" is not available here, so we can't access the
+ * "runqueues" with this_cpu_ptr(), as the compilation will fail in
+ * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr():
+ * typeof((ptr) + 0)
+ *
+ * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here.
+ */
+#ifdef CONFIG_SMP
+#define this_rq_raw() arch_raw_cpu_ptr(&runqueues)
+#else
+#define this_rq_raw() PERCPU_PTR(&runqueues)
+#endif
+#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned))
+
+static inline void __migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ if (unlikely(p->cpus_ptr != &p->cpus_mask))
+ ___migrate_enable();
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq_pinned()--;
+}
+
+static inline void __migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ this_rq_pinned()++;
+ p->migration_disabled = 1;
+}
+#else /* !COMPILE_OFFSETS */
+static inline void __migrate_disable(void) { }
+static inline void __migrate_enable(void) { }
+#endif /* !COMPILE_OFFSETS */
+
+/*
+ * So that it is possible to not export the runqueues variable, define and
+ * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use
+ * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will
+ * be defined in kernel/sched/core.c.
+ */
+#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE
+static __always_inline void migrate_disable(void)
+{
+ __migrate_disable();
+}
+
+static __always_inline void migrate_enable(void)
+{
+ __migrate_enable();
+}
+#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */
+
+#else /* MODULE */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+#endif /* MODULE */
+
+DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+
#endif