diff options
Diffstat (limited to 'include/linux/sched')
38 files changed, 3706 insertions, 120 deletions
diff --git a/include/linux/sched/affinity.h b/include/linux/sched/affinity.h new file mode 100644 index 000000000000..227f5be81bcd --- /dev/null +++ b/include/linux/sched/affinity.h @@ -0,0 +1 @@ +#include <linux/sched.h> diff --git a/include/linux/sched/autogroup.h b/include/linux/sched/autogroup.h new file mode 100644 index 000000000000..704391cc1d20 --- /dev/null +++ b/include/linux/sched/autogroup.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_AUTOGROUP_H +#define _LINUX_SCHED_AUTOGROUP_H + +struct signal_struct; +struct task_struct; +struct task_group; +struct seq_file; + +#ifdef CONFIG_SCHED_AUTOGROUP +extern void sched_autogroup_create_attach(struct task_struct *p); +extern void sched_autogroup_detach(struct task_struct *p); +extern void sched_autogroup_fork(struct signal_struct *sig); +extern void sched_autogroup_exit(struct signal_struct *sig); +extern void sched_autogroup_exit_task(struct task_struct *p); +#ifdef CONFIG_PROC_FS +extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); +#endif +#else +static inline void sched_autogroup_create_attach(struct task_struct *p) { } +static inline void sched_autogroup_detach(struct task_struct *p) { } +static inline void sched_autogroup_fork(struct signal_struct *sig) { } +static inline void sched_autogroup_exit(struct signal_struct *sig) { } +static inline void sched_autogroup_exit_task(struct task_struct *p) { } +#endif + +#ifdef CONFIG_CGROUP_SCHED +extern struct task_group root_task_group; +#endif /* CONFIG_CGROUP_SCHED */ + +#endif /* _LINUX_SCHED_AUTOGROUP_H */ diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h new file mode 100644 index 000000000000..196f0ca351a2 --- /dev/null +++ b/include/linux/sched/clock.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CLOCK_H +#define _LINUX_SCHED_CLOCK_H + +#include <linux/smp.h> + +/* + * Do not use outside of architecture code which knows its limitations. + * + * sched_clock() has no promise of monotonicity or bounded drift between + * CPUs, use (which you should not) requires disabling IRQs. + * + * Please use one of the three interfaces below. + */ +extern u64 sched_clock(void); + +#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK) +extern u64 sched_clock_noinstr(void); +#else +static __always_inline u64 sched_clock_noinstr(void) +{ + return sched_clock(); +} +#endif + +/* + * See the comment in kernel/sched/clock.c + */ +extern u64 running_clock(void); +extern u64 sched_clock_cpu(int cpu); + + +extern void sched_clock_init(void); + +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_tick(void) +{ +} + +static inline void clear_sched_clock_stable(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(void) +{ +} + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static __always_inline u64 local_clock_noinstr(void) +{ + return sched_clock_noinstr(); +} + +static __always_inline u64 local_clock(void) +{ + return sched_clock(); +} +#else +extern int sched_clock_stable(void); +extern void clear_sched_clock_stable(void); + +/* + * When sched_clock_stable(), __sched_clock_offset provides the offset + * between local_clock() and sched_clock(). + */ +extern u64 __sched_clock_offset; + +extern void sched_clock_tick(void); +extern void sched_clock_tick_stable(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(void); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +extern u64 local_clock_noinstr(void); +extern u64 local_clock(void); + +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * An i/f to runtime opt-in for irq time accounting based off of sched_clock. + * The reason for this explicit opt-in is not to have perf penalty with + * slow sched_clocks. + */ +extern void enable_sched_clock_irqtime(void); +extern void disable_sched_clock_irqtime(void); +#else +static inline void enable_sched_clock_irqtime(void) {} +static inline void disable_sched_clock_irqtime(void) {} +#endif + +#endif /* _LINUX_SCHED_CLOCK_H */ diff --git a/include/linux/sched/cond_resched.h b/include/linux/sched/cond_resched.h new file mode 100644 index 000000000000..227f5be81bcd --- /dev/null +++ b/include/linux/sched/cond_resched.h @@ -0,0 +1 @@ +#include <linux/sched.h> diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h new file mode 100644 index 000000000000..624fda17a785 --- /dev/null +++ b/include/linux/sched/coredump.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_COREDUMP_H +#define _LINUX_SCHED_COREDUMP_H + +#include <linux/mm_types.h> + +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ + +static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) +{ + /* + * By convention, dumpable bits are contained in first 32 bits of the + * bitmap, so we can simply access this first unsigned long directly. + */ + return __mm_flags_get_word(mm); +} + +static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) +{ + __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); +} + +extern void set_dumpable(struct mm_struct *mm, int value); +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ +static inline int __get_dumpable(unsigned long mm_flags) +{ + return mm_flags & MMF_DUMPABLE_MASK; +} + +static inline int get_dumpable(struct mm_struct *mm) +{ + unsigned long flags = __mm_flags_get_dumpable(mm); + + return __get_dumpable(flags); +} + +#endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h new file mode 100644 index 000000000000..bdd31ab93bc5 --- /dev/null +++ b/include/linux/sched/cpufreq.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CPUFREQ_H +#define _LINUX_SCHED_CPUFREQ_H + +#include <linux/types.h> + +/* + * Interface between cpufreq drivers and the scheduler: + */ + +#define SCHED_CPUFREQ_IOWAIT (1U << 0) + +#ifdef CONFIG_CPU_FREQ +struct cpufreq_policy; + +struct update_util_data { + void (*func)(struct update_util_data *data, u64 time, unsigned int flags); +}; + +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)); +void cpufreq_remove_update_util_hook(int cpu); +bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy); + +static inline unsigned long map_util_freq(unsigned long util, + unsigned long freq, unsigned long cap) +{ + return freq * util / cap; +} + +static inline unsigned long map_util_perf(unsigned long util) +{ + return util + (util >> 2); +} +#endif /* CONFIG_CPU_FREQ */ + +#endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h new file mode 100644 index 000000000000..5f8fd5b24a2e --- /dev/null +++ b/include/linux/sched/cputime.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_CPUTIME_H +#define _LINUX_SCHED_CPUTIME_H + +#include <linux/sched/signal.h> + +/* + * cputime accounting APIs: + */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern bool task_cputime(struct task_struct *t, + u64 *utime, u64 *stime); +extern u64 task_gtime(struct task_struct *t); +#else +static inline bool task_cputime(struct task_struct *t, + u64 *utime, u64 *stime) +{ + *utime = t->utime; + *stime = t->stime; + return false; +} + +static inline u64 task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif + +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME +static inline void task_cputime_scaled(struct task_struct *t, + u64 *utimescaled, + u64 *stimescaled) +{ + *utimescaled = t->utimescaled; + *stimescaled = t->stimescaled; +} +#else +static inline void task_cputime_scaled(struct task_struct *t, + u64 *utimescaled, + u64 *stimescaled) +{ + task_cputime(t, utimescaled, stimescaled); +} +#endif + +extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); +extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st); + +/* + * Thread group CPU time accounting. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); +void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples); + +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * get_running_cputimer - return &tsk->signal->cputimer if cputimers are active + * + * @tsk: Pointer to target task. + */ +#ifdef CONFIG_POSIX_TIMERS +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + /* + * Check whether posix CPU timers are active. If not the thread + * group accounting is not active either. Lockless check. + */ + if (!READ_ONCE(tsk->signal->posix_cputimers.timers_active)) + return NULL; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return NULL; + + return cputimer; +} +#else +static inline +struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) +{ + return NULL; +} +#endif + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + u64 cputime) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(cputime, &cputimer->cputime_atomic.utime); +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + u64 cputime) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(cputime, &cputimer->cputime_atomic.stime); +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); + + if (!cputimer) + return; + + atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime); +} + +static inline void prev_cputime_init(struct prev_cputime *prev) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + prev->utime = prev->stime = 0; + raw_spin_lock_init(&prev->lock); +#endif +} + +extern unsigned long long +task_sched_runtime(struct task_struct *task); + +#endif /* _LINUX_SCHED_CPUTIME_H */ diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h new file mode 100644 index 000000000000..c40115d4e34d --- /dev/null +++ b/include/linux/sched/deadline.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEADLINE_H +#define _LINUX_SCHED_DEADLINE_H + +/* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and + * NORMAL/BATCH tasks. + */ + +#include <linux/sched.h> + +static inline bool dl_prio(int prio) +{ + return unlikely(prio < MAX_DL_PRIO); +} + +/* + * Returns true if a task has a priority that belongs to DL class. PI-boosted + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. + */ +static inline bool dl_task(struct task_struct *p) +{ + return dl_prio(p->prio); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +struct root_domain; +extern void dl_add_task_root_domain(struct task_struct *p); +extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); + +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + +#endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h new file mode 100644 index 000000000000..35ed4577a6cc --- /dev/null +++ b/include/linux/sched/debug.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEBUG_H +#define _LINUX_SCHED_DEBUG_H + +/* + * Various scheduler/task debugging interfaces: + */ + +struct task_struct; +struct pid_namespace; + +extern void dump_cpu_task(int cpu); + +/* + * Only dump TASK_* tasks. (0 for all tasks) + */ +extern void show_state_filter(unsigned int state_filter); + +static inline void show_state(void) +{ + show_state_filter(0); +} + +struct pt_regs; + +extern void show_regs(struct pt_regs *); + +/* + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current + * task), SP is the stack pointer of the first frame that should be shown in the back + * trace (or NULL if the entire call-chain of the task should be shown). + */ +extern void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl); + +extern void sched_show_task(struct task_struct *p); + +struct seq_file; +extern void proc_sched_show_task(struct task_struct *p, + struct pid_namespace *ns, struct seq_file *m); +extern void proc_sched_set_task(struct task_struct *p); + +/* Attach to any functions which should be ignored in wchan output. */ +#define __sched __section(".sched.text") + +/* Linker adds these: start and end of __sched functions */ +extern char __sched_text_start[], __sched_text_end[]; + +/* Is this address in the __sched functions? */ +extern int in_sched_functions(unsigned long addr); + +#endif /* _LINUX_SCHED_DEBUG_H */ diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000000..bcb962d5ee7d --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT + +#include <linux/llist.h> +#include <linux/rhashtable-types.h> + +enum scx_public_consts { + SCX_OPS_NAME_LEN = 128, + + /* + * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses + * to set the slice for a task that is selected for execution. + * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice + * refill has been triggered. + * + * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass + * mode. As making forward progress for all tasks is the main goal of + * the bypass mode, a shorter slice is used. + */ + SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. + */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +/* + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details. + */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct task_struct __rcu *first_task; /* lockless peek at head */ + struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ + u32 nr; + u32 seq; /* used by BPF iter */ + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). + */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH. + */ + SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 4, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +enum scx_dsq_lnode_flags { + SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, + + /* high 16 bits can be for iter cursor flags */ + __SCX_DSQ_LNODE_PRIV_SHIFT = 16, +}; + +struct scx_dsq_list_node { + struct list_head node; + u32 flags; + u32 priv; /* can be used by iter cursor */ +}; + +#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ + (struct scx_dsq_list_node) { \ + .node = LIST_HEAD_INIT((__node).node), \ + .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ + .priv = (__priv), \ + } + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct scx_dsq_list_node dsq_list; /* dispatch order */ + struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_seq; + u32 dsq_flags; /* protected by DSQ lock */ + u32 flags; /* protected by rq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + s32 selected_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; + +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dsq_insert() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. + */ + u64 slice; + + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through + * scx_bpf_dsq_insert_vtime() but can also be modified directly by the + * BPF scheduler. Modifying it while a task is queued on a dsq may + * mangle the ordering and is not recommended. + */ + u64 dsq_vtime; + + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * Can be set from ops.init_task() while the BPF scheduler is being + * loaded (!scx_init_task_args->fork). If set and the task's policy is + * already %SCHED_EXT, the task's policy is rejected and forcefully + * reverted to %SCHED_NORMAL. The number of such events are reported + * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag + * during fork is not allowed. + */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif + struct list_head tasks_node; +}; + +void sched_ext_dead(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); +void scx_softlockup(u32 dur_s); +bool scx_hardlockup(int cpu); +bool scx_rcu_cpu_stall(void); + +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_dead(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} +static inline void scx_softlockup(u32 dur_s) {} +static inline bool scx_hardlockup(int cpu) { return false; } +static inline bool scx_rcu_cpu_stall(void) { return false; } + +#endif /* CONFIG_SCHED_CLASS_EXT */ + +struct scx_task_group { +#ifdef CONFIG_EXT_GROUP_SCHED + u32 flags; /* SCX_TG_* */ + u32 weight; + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; + bool idle; +#endif +}; + +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h new file mode 100644 index 000000000000..17e04859b9a4 --- /dev/null +++ b/include/linux/sched/hotplug.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_HOTPLUG_H +#define _LINUX_SCHED_HOTPLUG_H + +/* + * Scheduler interfaces for hotplug CPU support: + */ + +extern int sched_cpu_starting(unsigned int cpu); +extern int sched_cpu_activate(unsigned int cpu); +extern int sched_cpu_deactivate(unsigned int cpu); + +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_wait_empty(unsigned int cpu); +extern int sched_cpu_dying(unsigned int cpu); +#else +# define sched_cpu_wait_empty NULL +# define sched_cpu_dying NULL +#endif + +static inline void idle_task_exit(void) {} + +#endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h new file mode 100644 index 000000000000..8465ff1f20d1 --- /dev/null +++ b/include/linux/sched/idle.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_IDLE_H +#define _LINUX_SCHED_IDLE_H + +#include <linux/sched.h> + +enum cpu_idle_type { + __CPU_NOT_IDLE = 0, + CPU_IDLE, + CPU_NEWLY_IDLE, + CPU_MAX_IDLE_TYPES +}; + +extern void wake_up_if_idle(int cpu); + +/* + * Idle thread specific functions to determine the need_resched + * polling state. + */ +#ifdef TIF_POLLING_NRFLAG + +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H + +static __always_inline void __current_set_polling(void) +{ + arch_set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +static __always_inline void __current_clr_polling(void) +{ + arch_clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#else + +static __always_inline void __current_set_polling(void) +{ + set_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +static __always_inline void __current_clr_polling(void) +{ + clear_bit(TIF_POLLING_NRFLAG, + (unsigned long *)(¤t_thread_info()->flags)); +} + +#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */ + +static __always_inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_curr() + */ + smp_mb__after_atomic(); + + return unlikely(tif_need_resched()); +} + +static __always_inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_curr() + */ + smp_mb__after_atomic(); + + return unlikely(tif_need_resched()); +} + +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + +#else +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } + +static inline bool __must_check current_set_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +static inline bool __must_check current_clr_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} + +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + smp_mb(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} +#endif + +#endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/init.h b/include/linux/sched/init.h new file mode 100644 index 000000000000..03542575fd33 --- /dev/null +++ b/include/linux/sched/init.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_INIT_H +#define _LINUX_SCHED_INIT_H + +/* + * Scheduler init related prototypes: + */ + +extern void sched_init(void); +extern void sched_init_smp(void); + +#endif /* _LINUX_SCHED_INIT_H */ diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h new file mode 100644 index 000000000000..d8501f4709b5 --- /dev/null +++ b/include/linux/sched/isolation.h @@ -0,0 +1,80 @@ +#ifndef _LINUX_SCHED_ISOLATION_H +#define _LINUX_SCHED_ISOLATION_H + +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/init.h> +#include <linux/tick.h> + +enum hk_type { + HK_TYPE_DOMAIN, + HK_TYPE_MANAGED_IRQ, + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. + */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE +}; + +#ifdef CONFIG_CPU_ISOLATION +DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); +extern int housekeeping_any_cpu(enum hk_type type); +extern const struct cpumask *housekeeping_cpumask(enum hk_type type); +extern bool housekeeping_enabled(enum hk_type type); +extern void housekeeping_affine(struct task_struct *t, enum hk_type type); +extern bool housekeeping_test_cpu(int cpu, enum hk_type type); +extern void __init housekeeping_init(void); + +#else + +static inline int housekeeping_any_cpu(enum hk_type type) +{ + return smp_processor_id(); +} + +static inline const struct cpumask *housekeeping_cpumask(enum hk_type type) +{ + return cpu_possible_mask; +} + +static inline bool housekeeping_enabled(enum hk_type type) +{ + return false; +} + +static inline void housekeeping_affine(struct task_struct *t, + enum hk_type type) { } + +static inline bool housekeeping_test_cpu(int cpu, enum hk_type type) +{ + return true; +} + +static inline void housekeeping_init(void) { } +#endif /* CONFIG_CPU_ISOLATION */ + +static inline bool housekeeping_cpu(int cpu, enum hk_type type) +{ +#ifdef CONFIG_CPU_ISOLATION + if (static_branch_unlikely(&housekeeping_overridden)) + return housekeeping_test_cpu(cpu, type); +#endif + return true; +} + +static inline bool cpu_is_isolated(int cpu) +{ + return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || + !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || + cpuset_cpu_is_isolated(cpu); +} + +#endif /* _LINUX_SCHED_ISOLATION_H */ diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h new file mode 100644 index 000000000000..68876d0a7ef9 --- /dev/null +++ b/include/linux/sched/jobctl.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_JOBCTL_H +#define _LINUX_SCHED_JOBCTL_H + +#include <linux/types.h> + +struct task_struct; + +/* + * task->jobctl flags + */ +#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ + +#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ +#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ +#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ +#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ +#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ +#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ +#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ +#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ +#define JOBCTL_PTRACE_FROZEN_BIT 24 /* frozen for ptrace */ + +#define JOBCTL_STOPPED_BIT 26 /* do_signal_stop() */ +#define JOBCTL_TRACED_BIT 27 /* ptrace_stop() */ + +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) +#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) +#define JOBCTL_PTRACE_FROZEN (1UL << JOBCTL_PTRACE_FROZEN_BIT) + +#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT) +#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT) + +#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) + +extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); +extern void task_clear_jobctl_trapping(struct task_struct *task); +extern void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask); + +#endif /* _LINUX_SCHED_JOBCTL_H */ diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h new file mode 100644 index 000000000000..83ec54b65e79 --- /dev/null +++ b/include/linux/sched/loadavg.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_LOADAVG_H +#define _LINUX_SCHED_LOADAVG_H + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ +#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ +#define EXP_5 2014 /* 1/exp(5sec/5min) */ +#define EXP_15 2037 /* 1/exp(5sec/15min) */ + +/* + * a1 = a0 * e + a * (1 - e) + */ +static inline unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; +} + +extern unsigned long calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n); + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +extern void calc_global_load(void); + +#endif /* _LINUX_SCHED_LOADAVG_H */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h new file mode 100644 index 000000000000..0e1d73955fa5 --- /dev/null +++ b/include/linux/sched/mm.h @@ -0,0 +1,574 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_MM_H +#define _LINUX_SCHED_MM_H + +#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/sched.h> +#include <linux/mm_types.h> +#include <linux/gfp.h> +#include <linux/sync_core.h> +#include <linux/sched/coredump.h> + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct *mm_alloc(void); + +/** + * mmgrab() - Pin a &struct mm_struct. + * @mm: The &struct mm_struct to pin. + * + * Make sure that @mm will not get freed even after the owning task + * exits. This doesn't guarantee that the associated address space + * will still exist later on and mmget_not_zero() has to be used before + * accessing it. + * + * This is a preferred way to pin @mm for a longer/unbounded amount + * of time. + * + * Use mmdrop() to release the reference acquired by mmgrab(). + * + * See also <Documentation/mm/active_mm.rst> for an in-depth explanation + * of &mm_struct.mm_count vs &mm_struct.mm_users. + */ +static inline void mmgrab(struct mm_struct *mm) +{ + atomic_inc(&mm->mm_count); +} + +static inline void smp_mb__after_mmgrab(void) +{ + smp_mb__after_atomic(); +} + +extern void __mmdrop(struct mm_struct *mm); + +static inline void mmdrop(struct mm_struct *mm) +{ + /* + * The implicit full barrier implied by atomic_dec_and_test() is + * required by the membarrier system call before returning to + * user-space, after storing to rq->curr. + */ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} + +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +static inline void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + +/* Helpers for lazy TLB mm refcounting */ +static inline void mmgrab_lazy_tlb(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmgrab(mm); +} + +static inline void mmdrop_lazy_tlb(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { + mmdrop(mm); + } else { + /* + * mmdrop_lazy_tlb must provide a full memory barrier, see the + * membarrier comment finish_task_switch which relies on this. + */ + smp_mb(); + } +} + +static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) + mmdrop_sched(mm); + else + smp_mb(); /* see mmdrop_lazy_tlb() above */ +} + +/** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. + * + * Make sure that the address space of the given &struct mm_struct doesn't + * go away. This does not protect against parts of the address space being + * modified or freed, however. + * + * Never use this function to pin this address space for an + * unbounded/indefinite amount of time. + * + * Use mmput() to release the reference acquired by mmget(). + * + * See also <Documentation/mm/active_mm.rst> for an in-depth explanation + * of &mm_struct.mm_count vs &mm_struct.mm_users. + */ +static inline void mmget(struct mm_struct *mm) +{ + atomic_inc(&mm->mm_users); +} + +static inline bool mmget_not_zero(struct mm_struct *mm) +{ + return atomic_inc_not_zero(&mm->mm_users); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) +/* same as above but performs the slow path from the async context. Can + * be called from the atomic context as well + */ +void mmput_async(struct mm_struct *); +#endif + +/* Grab a reference to a task's mm, if it is not already going away */ +extern struct mm_struct *get_task_mm(struct task_struct *task); +/* + * Grab a reference to a task's mm, if it is not already going away + * and ptrace_may_access with the mode parameter passed to it + * succeeds. + */ +extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); +/* Remove the current tasks stale references to the old mm_struct on exit() */ +extern void exit_mm_release(struct task_struct *, struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct on exec() */ +extern void exec_mm_release(struct task_struct *, struct mm_struct *); + +#ifdef CONFIG_MEMCG +extern void mm_update_next_owner(struct mm_struct *mm); +#else +static inline void mm_update_next_owner(struct mm_struct *mm) +{ +} +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr, len, flags) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + +extern void arch_pick_mmap_layout(struct mm_struct *mm, + const struct rlimit *rlim_stack); + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t); + +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); + +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags, + vm_flags_t vm_flags); + +unsigned long +generic_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long +generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +#else +static inline void arch_pick_mmap_layout(struct mm_struct *mm, + const struct rlimit *rlim_stack) {} +#endif + +static inline bool in_vfork(struct task_struct *tsk) +{ + bool ret; + + /* + * need RCU to access ->real_parent if CLONE_VM was used along with + * CLONE_PARENT. + * + * We check real_parent->mm == tsk->mm because CLONE_VFORK does not + * imply CLONE_VM + * + * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus + * ->real_parent is not necessarily the task doing vfork(), so in + * theory we can't rely on task_lock() if we want to dereference it. + * + * And in this case we can't trust the real_parent->mm == tsk->mm + * check, it can be false negative. But we do not care, if init or + * another oom-unkillable task does this it should blame itself. + */ + rcu_read_lock(); + ret = tsk->vfork_done && + rcu_dereference(tsk->real_parent)->mm == tsk->mm; + rcu_read_unlock(); + + return ret; +} + +/* + * Applies per-task gfp context to the given allocation flags. + * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE + */ +static inline gfp_t current_gfp_context(gfp_t flags) +{ + unsigned int pflags = READ_ONCE(current->flags); + + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence + */ + if (pflags & PF_MEMALLOC_NOIO) + flags &= ~(__GFP_IO | __GFP_FS); + else if (pflags & PF_MEMALLOC_NOFS) + flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; + } + return flags; +} + +#ifdef CONFIG_LOCKDEP +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); +extern void fs_reclaim_acquire(gfp_t gfp_mask); +extern void fs_reclaim_release(gfp_t gfp_mask); +#else +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } +static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } +static inline void fs_reclaim_release(gfp_t gfp_mask) { } +#endif + +/* Any memory-allocation retry loop should use + * memalloc_retry_wait(), and pass the flags for the most + * constrained allocation attempt that might have failed. + * This provides useful documentation of where loops are, + * and a central place to fine tune the waiting as the MM + * implementation changes. + */ +static inline void memalloc_retry_wait(gfp_t gfp_flags) +{ + /* We use io_schedule_timeout because waiting for memory + * typically included waiting for dirty pages to be + * written out, which requires IO. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + gfp_flags = current_gfp_context(gfp_flags); + if (gfpflags_allow_blocking(gfp_flags) && + !(gfp_flags & __GFP_NORETRY)) + /* Probably waited already, no need for much more */ + io_schedule_timeout(1); + else + /* Probably didn't wait, and has now released a lock, + * so now is a good time to wait + */ + io_schedule_timeout(HZ/50); +} + +/** + * might_alloc - Mark possible allocation sites + * @gfp_mask: gfp_t flags that would be used to allocate + * + * Similar to might_sleep() and other annotations, this can be used in functions + * that might allocate, but often don't. Compiles to nothing without + * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. + */ +static inline void might_alloc(gfp_t gfp_mask) +{ + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); + + if (current->flags & PF_MEMALLOC) + return; + + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); +} + +/** + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * + * This allows PF_* flags to be conveniently added, irrespective of current + * value, and then the old version restored with memalloc_flags_restore(). + */ +static inline unsigned memalloc_flags_save(unsigned flags) +{ + unsigned oldflags = ~current->flags & flags; + current->flags |= flags; + return oldflags; +} + +static inline void memalloc_flags_restore(unsigned flags) +{ + current->flags &= ~flags; +} + +/** + * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. + * + * This functions marks the beginning of the GFP_NOIO allocation scope. + * All further allocations will implicitly drop __GFP_IO flag and so + * they are safe for the IO critical section from the allocation recursion + * point of view. Use memalloc_noio_restore to end the scope with flags + * returned by this function. + * + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_noio_restore. + */ +static inline unsigned int memalloc_noio_save(void) +{ + return memalloc_flags_save(PF_MEMALLOC_NOIO); +} + +/** + * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. + * @flags: Flags to restore. + * + * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. + * Always make sure that the given flags is the return value from the + * pairing memalloc_noio_save call. + */ +static inline void memalloc_noio_restore(unsigned int flags) +{ + memalloc_flags_restore(flags); +} + +/** + * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. + * + * This functions marks the beginning of the GFP_NOFS allocation scope. + * All further allocations will implicitly drop __GFP_FS flag and so + * they are safe for the FS critical section from the allocation recursion + * point of view. Use memalloc_nofs_restore to end the scope with flags + * returned by this function. + * + * Context: This function is safe to be used from any context. + * Return: The saved flags to be passed to memalloc_nofs_restore. + */ +static inline unsigned int memalloc_nofs_save(void) +{ + return memalloc_flags_save(PF_MEMALLOC_NOFS); +} + +/** + * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. + * @flags: Flags to restore. + * + * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. + * Always make sure that the given flags is the return value from the + * pairing memalloc_nofs_save call. + */ +static inline void memalloc_nofs_restore(unsigned int flags) +{ + memalloc_flags_restore(flags); +} + +/** + * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. + * + * This function marks the beginning of the __GFP_MEMALLOC allocation scope. + * All further allocations will implicitly add the __GFP_MEMALLOC flag, which + * prevents entering reclaim and allows access to all memory reserves. This + * should only be used when the caller guarantees the allocation will allow more + * memory to be freed very shortly, i.e. it needs to allocate some memory in + * the process of freeing memory, and cannot reclaim due to potential recursion. + * + * Users of this scope have to be extremely careful to not deplete the reserves + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. Usage of a + * pre-allocated pool (e.g. mempool) should be always considered before using + * this scope. + * + * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC + * + * Context: This function should not be used in an interrupt context as that one + * does not give PF_MEMALLOC access to reserves. + * See __gfp_pfmemalloc_flags(). + * Return: The saved flags to be passed to memalloc_noreclaim_restore. + */ +static inline unsigned int memalloc_noreclaim_save(void) +{ + return memalloc_flags_save(PF_MEMALLOC); +} + +/** + * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. + * @flags: Flags to restore. + * + * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save + * function. Always make sure that the given flags is the return value from the + * pairing memalloc_noreclaim_save call. + */ +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + memalloc_flags_restore(flags); +} + +/** + * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. + * + * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. + * All further allocations will implicitly remove the __GFP_MOVABLE flag, which + * will constraint the allocations to zones that allow long term pinning, i.e. + * not ZONE_MOVABLE zones. + * + * Return: The saved flags to be passed to memalloc_pin_restore. + */ +static inline unsigned int memalloc_pin_save(void) +{ + return memalloc_flags_save(PF_MEMALLOC_PIN); +} + +/** + * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. + * @flags: Flags to restore. + * + * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. + * Always make sure that the given flags is the return value from the pairing + * memalloc_pin_save call. + */ +static inline void memalloc_pin_restore(unsigned int flags) +{ + memalloc_flags_restore(flags); +} + +#ifdef CONFIG_MEMCG +DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); +/** + * set_active_memcg - Starts the remote memcg charging scope. + * @memcg: memcg to charge. + * + * This function marks the beginning of the remote memcg charging scope. All the + * __GFP_ACCOUNT allocations till the end of the scope will be charged to the + * given memcg. + * + * Please, make sure that caller has a reference to the passed memcg structure, + * so its lifetime is guaranteed to exceed the scope between two + * set_active_memcg() calls. + * + * NOTE: This function can nest. Users must save the return value and + * reset the previous value after their own charging scope is over. + */ +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) +{ + struct mem_cgroup *old; + + if (!in_task()) { + old = this_cpu_read(int_active_memcg); + this_cpu_write(int_active_memcg, memcg); + } else { + old = current->active_memcg; + current->active_memcg = memcg; + } + + return old; +} +#else +static inline struct mem_cgroup * +set_active_memcg(struct mem_cgroup *memcg) +{ + return NULL; +} +#endif + +#ifdef CONFIG_MEMBARRIER +enum { + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), + MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), + MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6), + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7), +}; + +enum { + MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), + MEMBARRIER_FLAG_RSEQ = (1U << 1), +}; + +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS +#include <asm/membarrier.h> +#endif + +static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) +{ + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; + if (current->mm != mm) + return; + if (likely(!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) + return; + sync_core_before_usermode(); +} + +extern void membarrier_exec_mmap(struct mm_struct *mm); + +extern void membarrier_update_current_mm(struct mm_struct *next_mm); + +#else +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS +static inline void membarrier_arch_switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ +} +#endif +static inline void membarrier_exec_mmap(struct mm_struct *mm) +{ +} +static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) +{ +} +static inline void membarrier_update_current_mm(struct mm_struct *next_mm) +{ +} +#endif + +#endif /* _LINUX_SCHED_MM_H */ diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h new file mode 100644 index 000000000000..0db7f67935fe --- /dev/null +++ b/include/linux/sched/nohz.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_NOHZ_H +#define _LINUX_SCHED_NOHZ_H + +/* + * This is the interface between the scheduler and nohz/dynticks: + */ + +#ifdef CONFIG_NO_HZ_COMMON +extern void nohz_balance_enter_idle(int cpu); +extern int get_nohz_timer_target(void); +#else +static inline void nohz_balance_enter_idle(int cpu) { } +#endif + +#ifdef CONFIG_NO_HZ_COMMON +void calc_load_nohz_start(void); +void calc_load_nohz_remote(struct rq *rq); +void calc_load_nohz_stop(void); +#else +static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_remote(struct rq *rq) { } +static inline void calc_load_nohz_stop(void) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_COMMON +extern void wake_up_nohz_cpu(int cpu); +#else +static inline void wake_up_nohz_cpu(int cpu) { } +#endif + +#endif /* _LINUX_SCHED_NOHZ_H */ diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h new file mode 100644 index 000000000000..52b22c5c396d --- /dev/null +++ b/include/linux/sched/numa_balancing.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_NUMA_BALANCING_H +#define _LINUX_SCHED_NUMA_BALANCING_H + +/* + * This is the interface between the scheduler and the MM that + * implements memory access pattern based NUMA-balancing: + */ + +#include <linux/sched.h> + +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 + +enum numa_vmaskip_reason { + NUMAB_SKIP_UNSUITABLE, + NUMAB_SKIP_SHARED_RO, + NUMAB_SKIP_INACCESSIBLE, + NUMAB_SKIP_SCAN_DELAY, + NUMAB_SKIP_PID_INACTIVE, + NUMAB_SKIP_IGNORE_PID, + NUMAB_SKIP_SEQ_COMPLETED, +}; + +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int last_node, int node, int pages, int flags); +extern pid_t task_numa_group_id(struct task_struct *p); +extern void set_numabalancing_state(bool enabled); +extern void task_numa_free(struct task_struct *p, bool final); +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, + int src_nid, int dst_cpu); +#else +static inline void task_numa_fault(int last_node, int node, int pages, + int flags) +{ +} +static inline pid_t task_numa_group_id(struct task_struct *p) +{ + return 0; +} +static inline void set_numabalancing_state(bool enabled) +{ +} +static inline void task_numa_free(struct task_struct *p, bool final) +{ +} +static inline bool should_numa_migrate_memory(struct task_struct *p, + struct folio *folio, int src_nid, int dst_cpu) +{ + return true; +} +#endif + +#endif /* _LINUX_SCHED_NUMA_BALANCING_H */ diff --git a/include/linux/sched/posix-timers.h b/include/linux/sched/posix-timers.h new file mode 100644 index 000000000000..523a381d6c88 --- /dev/null +++ b/include/linux/sched/posix-timers.h @@ -0,0 +1 @@ +#include <linux/posix-timers.h> diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h new file mode 100644 index 000000000000..6ab43b4f72f9 --- /dev/null +++ b/include/linux/sched/prio.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_PRIO_H +#define _LINUX_SCHED_PRIO_H + +#define MAX_NICE 19 +#define MIN_NICE -20 +#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1) + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + */ + +#define MAX_RT_PRIO 100 +#define MAX_DL_PRIO 0 + +#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) +#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO) +#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO) + +/* + * Convert nice value [19,-20] to rlimit style value [1,40]. + */ +static inline long nice_to_rlimit(long nice) +{ + return (MAX_NICE - nice + 1); +} + +/* + * Convert rlimit style value [1,40] to nice value [-20, 19]. + */ +static inline long rlimit_to_nice(long prio) +{ + return (MAX_NICE - prio + 1); +} + +#endif /* _LINUX_SCHED_PRIO_H */ diff --git a/include/linux/sched/rseq_api.h b/include/linux/sched/rseq_api.h new file mode 100644 index 000000000000..cf2af72693e1 --- /dev/null +++ b/include/linux/sched/rseq_api.h @@ -0,0 +1 @@ +#include <linux/rseq.h> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 440434df3627..4e3338103654 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -1,55 +1,75 @@ -#ifndef _SCHED_RT_H -#define _SCHED_RT_H +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_RT_H +#define _LINUX_SCHED_RT_H -/* - * Priority of a process goes from 0..MAX_PRIO-1, valid RT - * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH - * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority - * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. - */ +#include <linux/sched.h> -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +struct task_struct; -#define MAX_PRIO (MAX_RT_PRIO + 40) -#define DEFAULT_PRIO (MAX_RT_PRIO + 20) +static inline bool rt_prio(int prio) +{ + return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); +} -static inline int rt_prio(int prio) +static inline bool rt_or_dl_prio(int prio) { - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_RT_PRIO); } -static inline int rt_task(struct task_struct *p) +/* + * Returns true if a task has a priority that belongs to RT class. PI-boosted + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. + */ +static inline bool rt_task(struct task_struct *p) { return rt_prio(p->prio); } +/* + * Returns true if a task has a priority that belongs to RT or DL classes. + * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore + * PI-boosted tasks. + */ +static inline bool rt_or_dl_task(struct task_struct *p) +{ + return rt_or_dl_prio(p->prio); +} + +/* + * Returns true if a task has a policy that belongs to RT or DL classes. + * PI-boosted tasks will return false. + */ +static inline bool rt_or_dl_task_policy(struct task_struct *tsk) +{ + int policy = tsk->policy; + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; + if (policy == SCHED_DEADLINE) + return true; + return false; +} + #ifdef CONFIG_RT_MUTEXES -extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); -extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +extern void rt_mutex_pre_schedule(void); +extern void rt_mutex_schedule(void); +extern void rt_mutex_post_schedule(void); + +/* + * Must hold either p->pi_lock or task_rq(p)->lock. + */ +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) { - return tsk->pi_blocked_on != NULL; + return p->pi_top_task; } +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); +extern void rt_mutex_adjust_pi(struct task_struct *p); #else -static inline int rt_mutex_getprio(struct task_struct *p) +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { - return p->normal_prio; + return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} #endif extern void normalize_rt_tasks(void); @@ -61,4 +81,4 @@ extern void normalize_rt_tasks(void); */ #define RR_TIMESLICE (100 * HZ / 1000) -#endif /* _SCHED_RT_H */ +#endif /* _LINUX_SCHED_RT_H */ diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h new file mode 100644 index 000000000000..42839cfa2778 --- /dev/null +++ b/include/linux/sched/sd_flags.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sched-domains (multiprocessor balancing) flag declarations. + */ + +#ifndef SD_FLAG +# error "Incorrect import of SD flags definitions" +#endif + +/* + * Hierarchical metaflags + * + * SHARED_CHILD: These flags are meant to be set from the base domain upwards. + * If a domain has this flag set, all of its children should have it set. This + * is usually because the flag describes some shared resource (all CPUs in that + * domain share the same resource), or because they are tied to a scheduling + * behaviour that we want to disable at some point in the hierarchy for + * scalability reasons. + * + * In those cases it doesn't make sense to have the flag set for a domain but + * not have it in (some of) its children: sched domains ALWAYS span their child + * domains, so operations done with parent domains will cover CPUs in the lower + * child domains. + * + * + * SHARED_PARENT: These flags are meant to be set from the highest domain + * downwards. If a domain has this flag set, all of its parents should have it + * set. This is usually for topology properties that start to appear above a + * certain level (e.g. domain starts spanning CPUs outside of the base CPU's + * socket). + */ +#define SDF_SHARED_CHILD 0x1 +#define SDF_SHARED_PARENT 0x2 + +/* + * Behavioural metaflags + * + * NEEDS_GROUPS: These flags are only relevant if the domain they are set on has + * more than one group. This is usually for balancing flags (load balancing + * involves equalizing a metric between groups), or for flags describing some + * shared resource (which would be shared between groups). + */ +#define SDF_NEEDS_GROUPS 0x4 + +/* + * Balance when about to become idle + * + * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level. + * NEEDS_GROUPS: Load balancing flag. + */ +SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) + +/* + * Balance on exec + * + * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level. + * NEEDS_GROUPS: Load balancing flag. + */ +SD_FLAG(SD_BALANCE_EXEC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) + +/* + * Balance on fork, clone + * + * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level. + * NEEDS_GROUPS: Load balancing flag. + */ +SD_FLAG(SD_BALANCE_FORK, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) + +/* + * Balance on wakeup + * + * SHARED_CHILD: Set from the base domain up to cpuset.sched_relax_domain_level. + * NEEDS_GROUPS: Load balancing flag. + */ +SD_FLAG(SD_BALANCE_WAKE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) + +/* + * Consider waking task on waking CPU. + * + * SHARED_CHILD: Set from the base domain up to the NUMA reclaim level. + */ +SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD) + +/* + * Domain members have different CPU capacities + * + * SHARED_PARENT: Set from the topmost domain down to the first domain where + * asymmetry is detected. + * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups. + */ +SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + +/* + * Domain members have different CPU capacities spanning all unique CPU + * capacity values. + * + * SHARED_PARENT: Set from the topmost domain down to the first domain where + * all available CPU capacities are visible + * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups. + */ +SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + +/* + * Domain members share CPU capacity (i.e. SMT) + * + * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share + * CPU capacity. + * NEEDS_GROUPS: Capacity is shared between groups. + */ +SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) + +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + +/* + * Domain members share CPU Last Level Caches + * + * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share + * the same cache(s). + * NEEDS_GROUPS: Caches are shared between groups. + */ +SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) + +/* + * Only a single load balancing instance + * + * SHARED_PARENT: Set for all NUMA levels above NODE. Could be set from a + * different level upwards, but it doesn't change that if a + * domain has this flag set, then all of its parents need to have + * it too (otherwise the serialization doesn't make sense). + * NEEDS_GROUPS: No point in preserving domain if it has a single group. + */ +SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) + +/* + * Place busy tasks earlier in the domain + * + * NEEDS_GROUPS: Load balancing flag. + */ +SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) + +/* + * Prefer to place tasks in a sibling domain + * + * Set up until domains start spanning NUMA nodes. Close to being a SHARED_CHILD + * flag, but cleared below domains with SD_ASYM_CPUCAPACITY. + * + * NEEDS_GROUPS: Load balancing flag. + */ +SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS) + +/* + * Cross-node balancing + * + * SHARED_PARENT: Set for all NUMA levels above NODE. + * NEEDS_GROUPS: No point in preserving domain if it has a single group. + */ +SD_FLAG(SD_NUMA, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h new file mode 100644 index 000000000000..7d6449982822 --- /dev/null +++ b/include/linux/sched/signal.h @@ -0,0 +1,787 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SIGNAL_H +#define _LINUX_SCHED_SIGNAL_H + +#include <linux/rculist.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/sched/jobctl.h> +#include <linux/sched/task.h> +#include <linux/cred.h> +#include <linux/refcount.h> +#include <linux/pid.h> +#include <linux/posix-timers.h> +#include <linux/mm_types.h> +#include <asm/ptrace.h> + +/* + * Types defining task->signal and task->sighand and APIs using them: + */ + +struct sighand_struct { + spinlock_t siglock; + refcount_t count; + wait_queue_head_t signalfd_wqh; + struct k_sigaction action[_NSIG]; +}; + +/* + * Per-process accounting stats: + */ +struct pacct_struct { + int ac_flag; + long ac_exitcode; + unsigned long ac_mem; + u64 ac_utime, ac_stime; + unsigned long ac_minflt, ac_majflt; +}; + +struct cpu_itimer { + u64 expires; + u64 incr; +}; + +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } +/** + * struct thread_group_cputimer - thread group interval timer counts + * @cputime_atomic: atomic thread group interval timers. + * + * This structure contains the version of task_cputime, above, that is + * used for thread group CPU timer calculations. + */ +struct thread_group_cputimer { + struct task_cputime_atomic cputime_atomic; +}; + +struct multiprocess_signals { + sigset_t signal; + struct hlist_node node; +}; + +struct core_thread { + struct task_struct *task; + struct core_thread *next; +}; + +struct core_state { + atomic_t nr_threads; + struct core_thread dumper; + struct completion startup; +}; + +/* + * NOTE! "signal_struct" does not have its own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct. + */ +struct signal_struct { + refcount_t sigcnt; + atomic_t live; + int nr_threads; + int quick_threads; + struct list_head thread_head; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + + /* current thread group signal load-balancing target: */ + struct task_struct *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* For collecting multiprocess signals during fork */ + struct hlist_head multiprocess; + + /* thread group exit support */ + int group_exit_code; + /* notify group_exec_task when notify_count is less or equal to 0 */ + int notify_count; + struct task_struct *group_exec_task; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; + unsigned int flags; /* see SIGNAL_* flags below */ + + struct core_state *core_state; /* coredumping support */ + + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. + */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + +#ifdef CONFIG_POSIX_TIMERS + + /* POSIX.1b Interval Timers */ + unsigned int timer_create_restore_ids:1; + atomic_t next_posix_timer_id; + struct hlist_head posix_timers; + struct hlist_head ignored_posix_timers; + + /* ITIMER_REAL timer for the process */ + struct hrtimer real_timer; + ktime_t it_real_incr; + + /* + * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use + * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these + * values are defined to 0 and 1 respectively + */ + struct cpu_itimer it[2]; + + /* + * Thread group totals for process CPU timers. + * See thread_group_cputimer(), et al, for details. + */ + struct thread_group_cputimer cputimer; + +#endif + /* Empty if CONFIG_POSIX_TIMERS=n */ + struct posix_cputimers posix_cputimers; + + /* PID/PID hash table linkage. */ + struct pid *pids[PIDTYPE_MAX]; + +#ifdef CONFIG_NO_HZ_FULL + atomic_t tick_dep_mask; +#endif + + struct pid *tty_old_pgrp; + + /* boolean value for session group leader */ + int leader; + + struct tty_struct *tty; /* NULL if no tty */ + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + /* + * Cumulative resource counters for dead threads in the group, + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. + */ + seqlock_t stats_lock; + u64 utime, stime, cutime, cstime; + u64 gtime; + u64 cgtime; + struct prev_cputime prev_cputime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long inblock, oublock, cinblock, coublock; + unsigned long maxrss, cmaxrss; + struct task_io_accounting ioac; + + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; + +#ifdef CONFIG_BSD_PROCESS_ACCT + struct pacct_struct pacct; /* per-process accounting information */ +#endif +#ifdef CONFIG_TASKSTATS + struct taskstats *stats; +#endif +#ifdef CONFIG_AUDIT + unsigned audit_tty; + struct tty_audit_buf *tty_audit_buf; +#endif + +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + + /* + * Thread is the potential origin of an oom condition; kill first on + * oom + */ + bool oom_flag_origin; + short oom_score_adj; /* OOM kill score adjustment */ + short oom_score_adj_min; /* OOM kill score adjustment min value. + * Only settable by CAP_SYS_RESOURCE. */ + struct mm_struct *oom_mm; /* recorded mm when the thread group got + * killed by the oom killer */ + + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) + * Deprecated do not use in new code. + * Use exec_update_lock instead. + */ + struct rw_semaphore exec_update_lock; /* Held while task_struct is + * being updated during exec, + * and may have inconsistent + * permissions. + */ +} __randomize_layout; + +/* + * Bits in flags field of signal_struct. + */ +#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ +#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ +#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ +/* + * Pending notifications to parent. + */ +#define SIGNAL_CLD_STOPPED 0x00000010 +#define SIGNAL_CLD_CONTINUED 0x00000020 +#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) + +#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ + +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + +extern void flush_signals(struct task_struct *); +extern void ignore_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *, int force_default); +extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); + +static inline int kernel_dequeue_signal(void) +{ + struct task_struct *task = current; + kernel_siginfo_t __info; + enum pid_type __type; + int ret; + + spin_lock_irq(&task->sighand->siglock); + ret = dequeue_signal(&task->blocked, &__info, &__type); + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static inline void kernel_signal_stop(void) +{ + spin_lock_irq(¤t->sighand->siglock); + if (current->jobctl & JOBCTL_STOP_DEQUEUED) { + current->jobctl |= JOBCTL_STOPPED; + set_special_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); + + schedule(); +} + +int force_sig_fault_to_task(int sig, int code, void __user *addr, + struct task_struct *t); +int force_sig_fault(int sig, int code, void __user *addr); +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); + +int force_sig_mceerr(int code, void __user *, short); +int send_sig_mceerr(int code, void __user *, short, struct task_struct *); + +int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); +int force_sig_pkuerr(void __user *addr, u32 pkey); +int send_sig_perf(void __user *addr, u32 type, u64 sig_data); + +int force_sig_ptrace_errno_trap(int errno, void __user *addr); +int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); +int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, + struct task_struct *t); +int force_sig_seccomp(int syscall, int reason, bool force_coredump); + +extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); +extern void force_sigsegv(int sig); +extern int force_sig_info(struct kernel_siginfo *); +extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); +extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, + const struct cred *); +extern int kill_pgrp(struct pid *pid, int sig, int priv); +extern int kill_pid(struct pid *pid, int sig, int priv); +extern __must_check bool do_notify_parent(struct task_struct *, int); +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); +extern void force_sig(int); +extern void force_fatal_sig(int); +extern void force_exit_sig(int); +extern int send_sig(int, struct task_struct *, int); +extern int zap_other_threads(struct task_struct *p); +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); + +static inline void clear_notify_signal(void) +{ + clear_thread_flag(TIF_NOTIFY_SIGNAL); + smp_mb__after_atomic(); +} + +/* + * Returns 'true' if kick_process() is needed to force a transition from + * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. + */ +static inline bool __set_notify_signal(struct task_struct *task) +{ + return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE); +} + +/* + * Called to break out of interruptible wait loops, and enter the + * exit_to_user_mode_loop(). + */ +static inline void set_notify_signal(struct task_struct *task) +{ + if (__set_notify_signal(task)) + kick_process(task); +} + +static inline int restart_syscall(void) +{ + set_tsk_thread_flag(current, TIF_SIGPENDING); + return -ERESTARTNOINTR; +} + +static inline int task_sigpending(struct task_struct *p) +{ + return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); +} + +static inline int signal_pending(struct task_struct *p) +{ + /* + * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same + * behavior in terms of ensuring that we break out of wait loops + * so that notify signal callbacks can be processed. + */ + if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) + return 1; + return task_sigpending(p); +} + +static inline int __fatal_signal_pending(struct task_struct *p) +{ + return unlikely(sigismember(&p->pending.signal, SIGKILL)); +} + +static inline int fatal_signal_pending(struct task_struct *p) +{ + return task_sigpending(p) && __fatal_signal_pending(p); +} + +static inline int signal_pending_state(unsigned int state, struct task_struct *p) +{ + if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) + return 0; + if (!signal_pending(p)) + return 0; + + return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); +} + +/* + * This should only be used in fault handlers to decide whether we + * should stop the current fault routine to handle the signals + * instead, especially with the case where we've got interrupted with + * a VM_FAULT_RETRY. + */ +static inline bool fault_signal_pending(vm_fault_t fault_flags, + struct pt_regs *regs) +{ + return unlikely((fault_flags & VM_FAULT_RETRY) && + (fatal_signal_pending(current) || + (user_mode(regs) && signal_pending(current)))); +} + +/* + * Reevaluate whether the task has signals pending delivery. + * Wake the task if so. + * This is required every time the blocked sigset_t changes. + * callers must hold sighand->siglock. + */ +extern void recalc_sigpending(void); +extern void calculate_sigpending(void); + +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool fatal) +{ + unsigned int state = 0; + if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { + t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); + state = TASK_WAKEKILL | __TASK_TRACED; + } + signal_wake_up_state(t, state); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + unsigned int state = 0; + if (resume) { + t->jobctl &= ~JOBCTL_TRACED; + state = __TASK_TRACED; + } + signal_wake_up_state(t, state); +} + +void task_join_group_stop(struct task_struct *task); + +#ifdef TIF_RESTORE_SIGMASK +/* + * Legacy restore_sigmask accessors. These are inefficient on + * SMP architectures because they require atomic operations. + */ + +/** + * set_restore_sigmask() - make sure saved_sigmask processing gets done + * + * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code + * will run before returning to user mode, to process the flag. For + * all callers, TIF_SIGPENDING is already set or it's no harm to set + * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the + * arch code will notice on return to user mode, in case those bits + * are scarce. We set TIF_SIGPENDING here to ensure that the arch + * signal code always gets run when TIF_RESTORE_SIGMASK is set. + */ +static inline void set_restore_sigmask(void) +{ + set_thread_flag(TIF_RESTORE_SIGMASK); +} + +static inline void clear_tsk_restore_sigmask(struct task_struct *task) +{ + clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); +} + +static inline void clear_restore_sigmask(void) +{ + clear_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_tsk_restore_sigmask(struct task_struct *task) +{ + return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); +} +static inline bool test_restore_sigmask(void) +{ + return test_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_and_clear_restore_sigmask(void) +{ + return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); +} + +#else /* TIF_RESTORE_SIGMASK */ + +/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ +static inline void set_restore_sigmask(void) +{ + current->restore_sigmask = true; +} +static inline void clear_tsk_restore_sigmask(struct task_struct *task) +{ + task->restore_sigmask = false; +} +static inline void clear_restore_sigmask(void) +{ + current->restore_sigmask = false; +} +static inline bool test_restore_sigmask(void) +{ + return current->restore_sigmask; +} +static inline bool test_tsk_restore_sigmask(struct task_struct *task) +{ + return task->restore_sigmask; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + if (!current->restore_sigmask) + return false; + current->restore_sigmask = false; + return true; +} +#endif + +static inline void restore_saved_sigmask(void) +{ + if (test_and_clear_restore_sigmask()) + __set_current_blocked(¤t->saved_sigmask); +} + +extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); + +static inline void restore_saved_sigmask_unless(bool interrupted) +{ + if (interrupted) + WARN_ON(!signal_pending(current)); + else + restore_saved_sigmask(); +} + +static inline sigset_t *sigmask_to_save(void) +{ + sigset_t *res = ¤t->blocked; + if (unlikely(test_restore_sigmask())) + res = ¤t->saved_sigmask; + return res; +} + +static inline int kill_cad_pid(int sig, int priv) +{ + return kill_pid(cad_pid, sig, priv); +} + +/* These can be the second arg to send_sig_info/send_group_sig_info. */ +#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) +#define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) + +static inline int __on_sig_stack(unsigned long sp) +{ +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + +/* + * True if we are on the alternate signal stack. + */ +static inline int on_sig_stack(unsigned long sp) +{ + /* + * If the signal stack is SS_AUTODISARM then, by construction, we + * can't be on the signal stack unless user code deliberately set + * SS_AUTODISARM when we were already on it. + * + * This improves reliability: if user state gets corrupted such that + * the stack pointer points very close to the end of the signal stack, + * then this check will enable the signal to be handled anyway. + */ + if (current->sas_ss_flags & SS_AUTODISARM) + return 0; + + return __on_sig_stack(sp); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + if (!current->sas_ss_size) + return SS_DISABLE; + + return on_sig_stack(sp) ? SS_ONSTACK : 0; +} + +static inline void sas_ss_reset(struct task_struct *p) +{ + p->sas_ss_sp = 0; + p->sas_ss_size = 0; + p->sas_ss_flags = SS_DISABLE; +} + +static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) +{ + if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) +#ifdef CONFIG_STACK_GROWSUP + return current->sas_ss_sp; +#else + return current->sas_ss_sp + current->sas_ss_size; +#endif + return sp; +} + +extern void __cleanup_sighand(struct sighand_struct *); +extern void flush_itimer_signals(void); + +#define tasklist_empty() \ + list_empty(&init_task.tasks) + +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +extern bool current_is_single_threaded(void); + +/* + * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, + * otherwise next_thread(t) will never reach g after list_del_rcu(g). + */ +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ + lockdep_is_held(&tasklist_lock)) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + +typedef int (*proc_visitor)(struct task_struct *p, void *data); +void walk_process_tree(struct task_struct *top, proc_visitor, void *); + +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + if (type == PIDTYPE_PID) + pid = task_pid(task); + else + pid = task->signal->pids[type]; + return pid; +} + +static inline struct pid *task_tgid(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_TGID]; +} + +/* + * Without tasklist or RCU lock it is not safe to dereference + * the result of task_pgrp/task_session even if task == current, + * we can race with another thread doing sys_setsid/sys_setpgid. + */ +static inline struct pid *task_pgrp(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_PGID]; +} + +static inline struct pid *task_session(struct task_struct *task) +{ + return task->signal->pids[PIDTYPE_SID]; +} + +static inline int get_nr_threads(struct task_struct *task) +{ + return task->signal->nr_threads; +} + +static inline bool thread_group_leader(struct task_struct *p) +{ + return p->exit_signal >= 0; +} + +static inline +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) +{ + return p1->signal == p2->signal; +} + +/* + * returns NULL if p is the last thread in the thread group + */ +static inline struct task_struct *__next_thread(struct task_struct *p) +{ + return list_next_or_null_rcu(&p->signal->thread_head, + &p->thread_node, + struct task_struct, + thread_node); +} + +static inline struct task_struct *next_thread(struct task_struct *p) +{ + return __next_thread(p) ?: p->group_leader; +} + +static inline int thread_group_empty(struct task_struct *p) +{ + return thread_group_leader(p) && + list_is_last(&p->thread_node, &p->signal->thread_head); +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, + unsigned long *flags); + +static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, + unsigned long *flags) +{ + struct sighand_struct *ret; + + ret = __lock_task_sighand(task, flags); + (void)__cond_lock(&task->sighand->siglock, ret); + return ret; +} + +static inline void unlock_task_sighand(struct task_struct *task, + unsigned long *flags) +{ + spin_unlock_irqrestore(&task->sighand->siglock, *flags); +} + +#ifdef CONFIG_LOCKDEP +extern void lockdep_assert_task_sighand_held(struct task_struct *task); +#else +static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } +#endif + +static inline unsigned long task_rlimit(const struct task_struct *task, + unsigned int limit) +{ + return READ_ONCE(task->signal->rlim[limit].rlim_cur); +} + +static inline unsigned long task_rlimit_max(const struct task_struct *task, + unsigned int limit) +{ + return READ_ONCE(task->signal->rlim[limit].rlim_max); +} + +static inline unsigned long rlimit(unsigned int limit) +{ + return task_rlimit(current, limit); +} + +static inline unsigned long rlimit_max(unsigned int limit) +{ + return task_rlimit_max(current, limit); +} + +#endif /* _LINUX_SCHED_SIGNAL_H */ diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h new file mode 100644 index 000000000000..166b19af956f --- /dev/null +++ b/include/linux/sched/smt.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SMT_H +#define _LINUX_SCHED_SMT_H + +#include <linux/static_key.h> + +#ifdef CONFIG_SCHED_SMT +extern struct static_key_false sched_smt_present; + +static __always_inline bool sched_smt_active(void) +{ + return static_branch_likely(&sched_smt_present); +} +#else +static __always_inline bool sched_smt_active(void) { return false; } +#endif + +void arch_smt_update(void); + +#endif /* _LINUX_SCHED_SMT_H */ diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h new file mode 100644 index 000000000000..0108a38bb64d --- /dev/null +++ b/include/linux/sched/stat.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_STAT_H +#define _LINUX_SCHED_STAT_H + +#include <linux/percpu.h> +#include <linux/kconfig.h> + +/* + * Various counters maintained by the scheduler and fork(), + * exposed via /proc, sys.c or used by drivers via these APIs. + * + * ( Note that all these values are acquired without locking, + * so they can only be relied on in narrow circumstances. ) + */ + +extern unsigned long total_forks; +extern int nr_threads; +DECLARE_PER_CPU(unsigned long, process_counts); +extern int nr_processes(void); +extern unsigned int nr_running(void); +extern bool single_task_running(void); +extern unsigned int nr_iowait(void); +extern unsigned int nr_iowait_cpu(int cpu); + +static inline int sched_info_on(void) +{ + return IS_ENABLED(CONFIG_SCHED_INFO); +} + +#ifdef CONFIG_SCHEDSTATS +void force_schedstat_enabled(void); +#endif + +#endif /* _LINUX_SCHED_STAT_H */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index bf8086b2506e..5a64582b086b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -1,104 +1,32 @@ -#ifndef _SCHED_SYSCTL_H -#define _SCHED_SYSCTL_H +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_SYSCTL_H +#define _LINUX_SCHED_SYSCTL_H + +#include <linux/types.h> #ifdef CONFIG_DETECT_HUNG_TASK -extern unsigned int sysctl_hung_task_panic; -extern unsigned long sysctl_hung_task_check_count; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; #endif -/* - * Default maximum number of active map areas, this limits the number of vmas - * per mm struct. Users can overwrite this number by sysctl but there is a - * problem. - * - * When a program's coredump is generated as ELF format, a section is created - * per a vma. In ELF, the number of sections is represented in unsigned short. - * This means the number of sections should be smaller than 65535 at coredump. - * Because the kernel adds some informative sections to a image of program at - * generating coredump, we need some margin. The number of extra sections is - * 1-3 now and depends on arch. We use "5" as safe margin, here. - */ -#define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - -extern int sysctl_max_map_count; - -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; - enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_END, }; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_period_reset; -extern unsigned int sysctl_numa_balancing_scan_size; -extern unsigned int sysctl_numa_balancing_settle_count; +#define NUMA_BALANCING_DISABLED 0x0 +#define NUMA_BALANCING_NORMAL 0x1 +#define NUMA_BALANCING_MEMORY_TIERING 0x2 -#ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; -extern unsigned int sysctl_sched_shares_window; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); -#endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} +#ifdef CONFIG_NUMA_BALANCING +extern int sysctl_numa_balancing_mode; #else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif - -/* - * control realtime throttling: - * - * /proc/sys/kernel/sched_rt_period_us - * /proc/sys/kernel/sched_rt_runtime_us - */ -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - -#ifdef CONFIG_SCHED_AUTOGROUP -extern unsigned int sysctl_sched_autogroup_enabled; +#define sysctl_numa_balancing_mode 0 #endif -extern int sched_rr_timeslice; - -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#endif /* _SCHED_SYSCTL_H */ +#endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h new file mode 100644 index 000000000000..525aa2a632b2 --- /dev/null +++ b/include/linux/sched/task.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TASK_H +#define _LINUX_SCHED_TASK_H + +/* + * Interface between the scheduler and various task lifetime (fork()/exit()) + * functionality: + */ + +#include <linux/rcupdate.h> +#include <linux/refcount.h> +#include <linux/sched.h> +#include <linux/uaccess.h> + +struct task_struct; +struct rusage; +union thread_union; +struct css_set; + +/* All the bits taken by the old clone syscall. */ +#define CLONE_LEGACY_FLAGS 0xffffffffULL + +struct kernel_clone_args { + u64 flags; + int __user *pidfd; + int __user *child_tid; + int __user *parent_tid; + const char *name; + int exit_signal; + u32 kthread:1; + u32 io_thread:1; + u32 user_worker:1; + u32 no_files:1; + unsigned long stack; + unsigned long stack_size; + unsigned long tls; + pid_t *set_tid; + /* Number of elements in *set_tid */ + size_t set_tid_size; + int cgroup; + int idle; + int (*fn)(void *); + void *fn_arg; + struct cgroup *cgrp; + struct css_set *cset; + unsigned int kill_seq; +}; + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t mmlist_lock; + +extern union thread_union init_thread_union; +extern struct task_struct init_task; + +extern int lockdep_tasklist_lock_is_held(void); + +extern asmlinkage void schedule_tail(struct task_struct *prev); +extern void init_idle(struct task_struct *idle, int cpu); + +extern int sched_fork(u64 clone_flags, struct task_struct *p); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); +extern void sched_post_fork(struct task_struct *p); +extern void sched_dead(struct task_struct *p); + +void __noreturn do_task_dead(void); +void __noreturn make_task_dead(int signr); + +extern void mm_cache_init(void); +extern void proc_caches_init(void); + +extern void fork_init(void); + +extern void release_task(struct task_struct * p); + +extern int copy_thread(struct task_struct *, const struct kernel_clone_args *); + +extern void flush_thread(void); + +#ifdef CONFIG_HAVE_EXIT_THREAD +extern void exit_thread(struct task_struct *tsk); +#else +static inline void exit_thread(struct task_struct *tsk) +{ +} +#endif +extern __noreturn void do_group_exit(int); + +extern void exit_files(struct task_struct *); +extern void exit_itimers(struct task_struct *); + +extern pid_t kernel_clone(struct kernel_clone_args *kargs); +struct task_struct *copy_process(struct pid *pid, int trace, int node, + struct kernel_clone_args *args); +struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); +struct task_struct *fork_idle(int); +extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags); +extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); +int kernel_wait(pid_t pid, int *stat); + +extern void free_task(struct task_struct *tsk); + +/* sched_exec is called by processes performing an exec */ +extern void sched_exec(void); + +static inline struct task_struct *get_task_struct(struct task_struct *t) +{ + refcount_inc(&t->usage); + return t; +} + +static inline struct task_struct *tryget_task_struct(struct task_struct *t) +{ + return refcount_inc_not_zero(&t->usage) ? t : NULL; +} + +extern void __put_task_struct(struct task_struct *t); +extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (!refcount_dec_and_test(&t->usage)) + return; + + /* + * Under PREEMPT_RT, we can't call __put_task_struct + * in atomic context because it will indirectly + * acquire sleeping locks. The same is true if the + * current process has a mutex enqueued (blocked on + * a PI chain). + * + * In !RT, it is always safe to call __put_task_struct(). + * Though, in order to simplify the code, resort to the + * deferred call too. + * + * call_rcu() will schedule __put_task_struct_rcu_cb() + * to be called in process context. + * + * __put_task_struct() is called when + * refcount_dec_and_test(&t->usage) succeeds. + * + * This means that it can't "conflict" with + * put_task_struct_rcu_user() which abuses ->rcu the same + * way; rcu_users has a reference so task->usage can't be + * zero after rcu_users 1 -> 0 transition. + * + * delayed_free_task() also uses ->rcu, but it is only called + * when it fails to fork a process. Therefore, there is no + * way it can conflict with __put_task_struct(). + */ + call_rcu(&t->rcu, __put_task_struct_rcu_cb); +} + +DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T)) + +static inline void put_task_struct_many(struct task_struct *t, int nr) +{ + if (refcount_sub_and_test(nr, &t->usage)) + __put_task_struct(t); +} + +void put_task_struct_rcu_user(struct task_struct *task); + +/* Free all architecture-specific resources held by a thread. */ +void release_thread(struct task_struct *dead_task); + +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +extern int arch_task_struct_size __read_mostly; +#else +# define arch_task_struct_size (sizeof(struct task_struct)) +#endif + +#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST +/* + * If an architecture has not declared a thread_struct whitelist we + * must assume something there may need to be copied to userspace. + */ +static inline void arch_thread_struct_whitelist(unsigned long *offset, + unsigned long *size) +{ + *offset = 0; + /* Handle dynamically sized thread_struct. */ + *size = arch_task_struct_size - offsetof(struct task_struct, thread); +} +#endif + +#ifdef CONFIG_VMAP_STACK +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return t->stack_vm_area; +} +#else +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return NULL; +} +#endif + +/* + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. + * + * Nests inside of read_lock(&tasklist_lock). It must not be nested with + * write_lock_irq(&tasklist_lock), neither inside nor outside. + */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) + +#endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/sched/task_flags.h b/include/linux/sched/task_flags.h new file mode 100644 index 000000000000..227f5be81bcd --- /dev/null +++ b/include/linux/sched/task_flags.h @@ -0,0 +1 @@ +#include <linux/sched.h> diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h new file mode 100644 index 000000000000..1fab7e9043a3 --- /dev/null +++ b/include/linux/sched/task_stack.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TASK_STACK_H +#define _LINUX_SCHED_TASK_STACK_H + +/* + * task->stack (kernel stack) handling interfaces: + */ + +#include <linux/sched.h> +#include <linux/magic.h> +#include <linux/refcount.h> +#include <linux/kasan.h> + +#ifdef CONFIG_THREAD_INFO_IN_TASK + +/* + * When accessing the stack of a non-current task that might exit, use + * try_get_task_stack() instead. task_stack_page will return a pointer + * that could get freed out from under you. + */ +static __always_inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} + +#define setup_thread_stack(new,old) do { } while(0) + +static __always_inline unsigned long *end_of_stack(const struct task_struct *task) +{ +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; +#else + return task->stack; +#endif +} + +#else + +#define task_stack_page(task) ((void *)(task)->stack) + +static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) +{ + *task_thread_info(p) = *task_thread_info(org); + task_thread_info(p)->task = p; +} + +/* + * Return the address of the last usable long on the stack. + * + * When the stack grows down, this is just above the thread + * info struct. Going any lower will corrupt the threadinfo. + * + * When the stack grows up, this is the highest address. + * Beyond that position, we corrupt data on the next page. + */ +static inline unsigned long *end_of_stack(const struct task_struct *p) +{ +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; +#else + return (unsigned long *)(task_thread_info(p) + 1); +#endif +} + +#endif + +#ifdef CONFIG_THREAD_INFO_IN_TASK +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return refcount_inc_not_zero(&tsk->stack_refcount) ? + task_stack_page(tsk) : NULL; +} + +extern void put_task_stack(struct task_struct *tsk); +#else +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return task_stack_page(tsk); +} + +static inline void put_task_stack(struct task_struct *tsk) {} +#endif + +void exit_task_stack_account(struct task_struct *tsk); + +#define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) + +static inline int object_is_on_stack(const void *obj) +{ + void *stack = task_stack_page(current); + + obj = kasan_reset_tag(obj); + return (obj >= stack) && (obj < (stack + THREAD_SIZE)); +} + +extern void thread_stack_cache_init(void); + +#ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p); +#else +static inline unsigned long stack_not_used(struct task_struct *p) +{ + return 0; +} +#endif +extern void set_task_stack_end_magic(struct task_struct *tsk); + +static inline int kstack_end(void *addr) +{ + /* Reliable end of stack detection: + * Some APM bios versions misalign the stack + */ + return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); +} + +#endif /* _LINUX_SCHED_TASK_STACK_H */ diff --git a/include/linux/sched/thread_info_api.h b/include/linux/sched/thread_info_api.h new file mode 100644 index 000000000000..2c60fbc16c08 --- /dev/null +++ b/include/linux/sched/thread_info_api.h @@ -0,0 +1 @@ +#include <linux/thread_info.h> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h new file mode 100644 index 000000000000..45c0022b91ce --- /dev/null +++ b/include/linux/sched/topology.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TOPOLOGY_H +#define _LINUX_SCHED_TOPOLOGY_H + +#include <linux/topology.h> + +#include <linux/sched/idle.h> + +/* + * sched-domains (multiprocessor balancing) declarations: + */ + +/* Generate SD flag indexes */ +#define SD_FLAG(name, mflags) __##name, +enum { + #include <linux/sched/sd_flags.h> + __SD_FLAG_CNT, +}; +#undef SD_FLAG +/* Generate SD flag bits */ +#define SD_FLAG(name, mflags) name = 1 << __##name, +enum { + #include <linux/sched/sd_flags.h> +}; +#undef SD_FLAG + +struct sd_flag_debug { + unsigned int meta_flags; + char *name; +}; +extern const struct sd_flag_debug sd_flag_debug[]; + +struct sched_domain_topology_level; + +#ifdef CONFIG_SCHED_SMT +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); +#endif + +#ifdef CONFIG_SCHED_CLUSTER +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); +#endif + +#ifdef CONFIG_SCHED_MC +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); +#endif + +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); + +extern int arch_asym_cpu_priority(int cpu); + +struct sched_domain_attr { + int relax_domain_level; +}; + +#define SD_ATTR_INIT (struct sched_domain_attr) { \ + .relax_domain_level = -1, \ +} + +extern int sched_domain_level_max; + +struct sched_group; + +struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain __rcu *parent; /* top domain must be null terminated */ + struct sched_domain __rcu *child; /* bottom domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int imb_numa_nr; /* Nr running tasks that allows a NUMA imbalance */ + + int nohz_idle; /* NOHZ IDLE status */ + int flags; /* See SD_* */ + int level; + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. */ + unsigned int nr_balance_failed; /* initialise to 0 */ + + /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; + u64 max_newidle_lb_cost; + unsigned long last_decay_max_lb_cost; + +#ifdef CONFIG_SCHEDSTATS + /* sched_balance_rq() stats */ + unsigned int lb_count[CPU_MAX_IDLE_TYPES]; + unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; + unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; + + /* Active load balancing */ + unsigned int alb_count; + unsigned int alb_failed; + unsigned int alb_pushed; + + /* SD_BALANCE_EXEC stats */ + unsigned int sbe_count; + unsigned int sbe_balanced; + unsigned int sbe_pushed; + + /* SD_BALANCE_FORK stats */ + unsigned int sbf_count; + unsigned int sbf_balanced; + unsigned int sbf_pushed; + + /* try_to_wake_up() stats */ + unsigned int ttwu_wake_remote; + unsigned int ttwu_move_affine; + unsigned int ttwu_move_balance; +#endif + char *name; + union { + void *private; /* used during construction */ + struct rcu_head rcu; /* used during destruction */ + }; + struct sched_domain_shared *shared; + + unsigned int span_weight; + /* + * Span of all CPUs in this domain. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long span[]; +}; + +static inline struct cpumask *sched_domain_span(struct sched_domain *sd) +{ + return to_cpumask(sd->span); +} + +extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new); + +/* Allocate an array of sched domains, for partition_sched_domains(). */ +cpumask_var_t *alloc_sched_domains(unsigned int ndoms); +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); + +bool cpus_equal_capacity(int this_cpu, int that_cpu); +bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); + +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); +typedef int (*sched_domain_flags_f)(void); + +struct sd_data { + struct sched_domain *__percpu *sd; + struct sched_domain_shared *__percpu *sds; + struct sched_group *__percpu *sg; + struct sched_group_capacity *__percpu *sgc; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int numa_level; + struct sd_data data; + char *name; +}; + +extern void __init set_sched_topology(struct sched_domain_topology_level *tl); +extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); + +#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \ + { .mask = maskfn, .sd_flags = flagsfn, .name = #dname }) + +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + +#ifndef arch_scale_cpu_capacity +/** + * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. + * @cpu: the CPU in question. + * + * Return: the CPU scale factor normalized against SCHED_CAPACITY_SCALE, i.e. + * + * max_perf(cpu) + * ----------------------------- * SCHED_CAPACITY_SCALE + * max(max_perf(c) : c \in CPUs) + */ +static __always_inline +unsigned long arch_scale_cpu_capacity(int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifndef arch_scale_hw_pressure +static __always_inline +unsigned long arch_scale_hw_pressure(int cpu) +{ + return 0; +} +#endif + +#ifndef arch_update_hw_pressure +static __always_inline +void arch_update_hw_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) +{ } +#endif + +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + +static inline int task_node(const struct task_struct *p) +{ + return cpu_to_node(task_cpu(p)); +} + +#endif /* _LINUX_SCHED_TOPOLOGY_H */ diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h new file mode 100644 index 000000000000..969aaf5ef9d6 --- /dev/null +++ b/include/linux/sched/types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_TYPES_H +#define _LINUX_SCHED_TYPES_H + +#include <linux/types.h> + +/** + * struct task_cputime - collected CPU time counts + * @stime: time spent in kernel mode, in nanoseconds + * @utime: time spent in user mode, in nanoseconds + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * + * This structure groups together three kinds of CPU time that are tracked for + * threads and thread groups. Most things considering CPU time want to group + * these counts together and treat all three of them in parallel. + */ +struct task_cputime { + u64 stime; + u64 utime; + unsigned long long sum_exec_runtime; +}; + +#endif /* _LINUX_SCHED_TYPES_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h new file mode 100644 index 000000000000..4cc52698e214 --- /dev/null +++ b/include/linux/sched/user.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_USER_H +#define _LINUX_SCHED_USER_H + +#include <linux/uidgid.h> +#include <linux/atomic.h> +#include <linux/percpu_counter.h> +#include <linux/refcount.h> +#include <linux/ratelimit.h> + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + refcount_t __count; /* reference count */ +#ifdef CONFIG_EPOLL + struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */ +#endif + unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ + + /* Hash table maintenance information */ + struct hlist_node uidhash_node; + kuid_t uid; + +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ + defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \ + defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD) + atomic_long_t locked_vm; +#endif +#ifdef CONFIG_WATCH_QUEUE + atomic_t nr_watches; /* The number of watches this user currently has */ +#endif + + /* Miscellaneous per-user rate limit */ + struct ratelimit_state ratelimit; +}; + +extern int uids_sysfs_init(void); + +extern struct user_struct *find_user(kuid_t); + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + + +/* per-UID process charging. */ +extern struct user_struct * alloc_uid(kuid_t); +static inline struct user_struct *get_uid(struct user_struct *u) +{ + refcount_inc(&u->__count); + return u; +} +extern void free_uid(struct user_struct *); + +#endif /* _LINUX_SCHED_USER_H */ diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h new file mode 100644 index 000000000000..25446c5d3508 --- /dev/null +++ b/include/linux/sched/vhost_task.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_VHOST_TASK_H +#define _LINUX_SCHED_VHOST_TASK_H + +struct vhost_task; + +struct vhost_task *vhost_task_create(bool (*fn)(void *), + void (*handle_kill)(void *), void *arg, + const char *name); +void vhost_task_start(struct vhost_task *vtsk); +void vhost_task_stop(struct vhost_task *vtsk); +void vhost_task_wake(struct vhost_task *vtsk); + +#endif /* _LINUX_SCHED_VHOST_TASK_H */ diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h new file mode 100644 index 000000000000..0f28b4623ad4 --- /dev/null +++ b/include/linux/sched/wake_q.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_WAKE_Q_H +#define _LINUX_SCHED_WAKE_Q_H + +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The DEFINE_WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function. Otherwise, the list can be + * re-initialized for later re-use by wake_q_init(). + * + * NOTE that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. + * + * NOTE that there is no guarantee the wakeup will happen any later than the + * wake_q_add() location. Therefore task must be ready to be woken at the + * location of the wake_q_add(). + */ + +#include <linux/sched.h> + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define WAKE_Q_HEAD_INITIALIZER(name) \ + { WAKE_Q_TAIL, &name.first } + +#define DEFINE_WAKE_Q(name) \ + struct wake_q_head name = WAKE_Q_HEAD_INITIALIZER(name) + +static inline void wake_q_init(struct wake_q_head *head) +{ + head->first = WAKE_Q_TAIL; + head->lastp = &head->first; +} + +static inline bool wake_q_empty(struct wake_q_head *head) +{ + return head->first == WAKE_Q_TAIL; +} + +extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); +extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); + +/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */ +static inline +void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irq(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags, + struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irqrestore(lock, flags); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} +#endif /* _LINUX_SCHED_WAKE_Q_H */ diff --git a/include/linux/sched/xacct.h b/include/linux/sched/xacct.h new file mode 100644 index 000000000000..c078f0a94cec --- /dev/null +++ b/include/linux/sched/xacct.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_XACCT_H +#define _LINUX_SCHED_XACCT_H + +/* + * Extended task accounting methods: + */ + +#include <linux/sched.h> + +#ifdef CONFIG_TASK_XACCT +static inline void add_rchar(struct task_struct *tsk, ssize_t amt) +{ + tsk->ioac.rchar += amt; +} + +static inline void add_wchar(struct task_struct *tsk, ssize_t amt) +{ + tsk->ioac.wchar += amt; +} + +static inline void inc_syscr(struct task_struct *tsk) +{ + tsk->ioac.syscr++; +} + +static inline void inc_syscw(struct task_struct *tsk) +{ + tsk->ioac.syscw++; +} +#else +static inline void add_rchar(struct task_struct *tsk, ssize_t amt) +{ +} + +static inline void add_wchar(struct task_struct *tsk, ssize_t amt) +{ +} + +static inline void inc_syscr(struct task_struct *tsk) +{ +} + +static inline void inc_syscw(struct task_struct *tsk) +{ +} +#endif + +#endif /* _LINUX_SCHED_XACCT_H */ |
