Diffstat (limited to 'include/linux/sched')
-rw-r--r-- | include/linux/sched/coredump.h   |  79
-rw-r--r-- | include/linux/sched/deadline.h   |  18
-rw-r--r-- | include/linux/sched/debug.h      |   2
-rw-r--r-- | include/linux/sched/ext.h        | 217
-rw-r--r-- | include/linux/sched/hotplug.h    |   4
-rw-r--r-- | include/linux/sched/idle.h       |  25
-rw-r--r-- | include/linux/sched/isolation.h  |  21
-rw-r--r-- | include/linux/sched/mm.h         |  55
-rw-r--r-- | include/linux/sched/prio.h       |   1
-rw-r--r-- | include/linux/sched/rt.h         |  33
-rw-r--r-- | include/linux/sched/signal.h     |  14
-rw-r--r-- | include/linux/sched/smt.h        |   2
-rw-r--r-- | include/linux/sched/task.h       |   9
-rw-r--r-- | include/linux/sched/task_stack.h |  24
-rw-r--r-- | include/linux/sched/topology.h   |  43
-rw-r--r-- | include/linux/sched/vhost_task.h |   3
-rw-r--r-- | include/linux/sched/wake_q.h     |  34
17 files changed, 392 insertions, 192 deletions
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 02f5090ffea2..6eb65ceed213 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,12 +8,6 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -/* mm flags */ - -/* for SUID_DUMP_* above */ -#define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) - extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -31,77 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -/* coredump filter bits */ -#define MMF_DUMP_ANON_PRIVATE 2 -#define MMF_DUMP_ANON_SHARED 3 -#define MMF_DUMP_MAPPED_PRIVATE 4 -#define MMF_DUMP_MAPPED_SHARED 5 -#define MMF_DUMP_ELF_HEADERS 6 -#define MMF_DUMP_HUGETLB_PRIVATE 7 -#define MMF_DUMP_HUGETLB_SHARED 8 -#define MMF_DUMP_DAX_PRIVATE 9 -#define MMF_DUMP_DAX_SHARED 10 - -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 9 -#define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) -#define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) - -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) -#else -# define MMF_DUMP_MASK_DEFAULT_ELF 0 -#endif - /* leave room for more dump flags */ -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when mm is available for - khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ - -#define MMF_HAS_UPROBES 19 /* has uprobes */ -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ -#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ -/* - * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either - * replaced in the future by mm.pinned_vm when it becomes stable, or grow into - * a counter on its own. We're aggresive on this bit for now: even if the - * pinned pages were unpinned later on, we'll still keep this bit set for the - * lifecycle of this mm, just for simplicity. 
- */ -#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ - -#define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - - -#define MMF_HAS_MDWE_NO_INHERIT 29 - -#define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) - -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ - MMF_VM_MERGE_ANY_MASK) - -static inline unsigned long mmf_init_flags(unsigned long flags) -{ - if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) - flags &= ~((1UL << MMF_HAS_MDWE) | - (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; -} - #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index df3aca89d4f5..f9aabbc9d22e 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -10,16 +10,16 @@ #include <linux/sched.h> -#define MAX_DL_PRIO 0 - -static inline int dl_prio(int prio) +static inline bool dl_prio(int prio) { - if (unlikely(prio < MAX_DL_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_DL_PRIO); } -static inline int dl_task(struct task_struct *p) +/* + * Returns true if a task has a priority that belongs to DL class. PI-boosted + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. + */ +static inline bool dl_task(struct task_struct *p) { return dl_prio(p->prio); } @@ -34,7 +34,11 @@ static inline bool dl_time_before(u64 a, u64 b) struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); #endif /* CONFIG_SMP */ +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + #endif /* _LINUX_SCHED_DEADLINE_H */ diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index b5035afa2396..35ed4577a6cc 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp, extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_SCHED_DEBUG struct seq_file; extern void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __section(".sched.text") diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000000..f7545430a548 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT + +#include <linux/llist.h> +#include <linux/rhashtable-types.h> + +enum scx_public_consts { + SCX_OPS_NAME_LEN = 128, + + SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. 
+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. + */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +/* + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details. + */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ + u32 nr; + u32 seq; /* used by BPF iter */ + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). + */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 4, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +enum scx_dsq_lnode_flags { + SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, + + /* high 16 bits can be for iter cursor flags */ + __SCX_DSQ_LNODE_PRIV_SHIFT = 16, +}; + +struct scx_dsq_list_node { + struct list_head node; + u32 flags; + u32 priv; /* can be used by iter cursor */ +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. 
+ */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct scx_dsq_list_node dsq_list; /* dispatch order */ + struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_seq; + u32 dsq_flags; /* protected by DSQ lock */ + u32 flags; /* protected by rq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + s32 selected_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; + +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. + */ + u64 slice; + + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * Can be set from ops.init_task() while the BPF scheduler is being + * loaded (!scx_init_task_args->fork). If set and the task's policy is + * already %SCHED_EXT, the task's policy is rejected and forcefully + * reverted to %SCHED_NORMAL. The number of such events are reported + * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag + * during fork is not allowed. 
+ */ + bool disallow; /* reject switching into SCX */ + + /* cold fields */ +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif + struct list_head tasks_node; +}; + +void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); +void scx_softlockup(u32 dur_s); + +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} +static inline void scx_softlockup(u32 dur_s) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 412cdaba33eb..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu); # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 478084f9105e..439f6029d3b9 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -5,8 +5,8 @@ #include <linux/sched.h> enum cpu_idle_type { + __CPU_NOT_IDLE = 0, CPU_IDLE, - CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_MAX_IDLE_TYPES }; @@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) return unlikely(tif_need_resched()); } +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + #else static inline void __current_set_polling(void) { } static inline void __current_clr_polling(void) { } @@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void) { return unlikely(tif_need_resched()); } -#endif static __always_inline void current_clr_polling(void) { __current_clr_polling(); - /* - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. - * Once the bit is cleared, we'll get IPIs with every new - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also - * fold. - */ smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } +#endif #endif /* _LINUX_SCHED_IDLE_H */ diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 2b461129d1fa..d8501f4709b5 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,16 +7,21 @@ #include <linux/tick.h> enum hk_type { - HK_TYPE_TIMER, - HK_TYPE_RCU, - HK_TYPE_MISC, - HK_TYPE_SCHED, - HK_TYPE_TICK, HK_TYPE_DOMAIN, - HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, - HK_TYPE_KTHREAD, - HK_TYPE_MAX + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. 
+ */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index b6543f9d78d6..b13474825130 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -8,6 +8,7 @@ #include <linux/mm_types.h> #include <linux/gfp.h> #include <linux/sync_core.h> +#include <linux/sched/coredump.h> /* * Routines for handling mm_structs @@ -178,22 +179,36 @@ static inline void mm_update_next_owner(struct mm_struct *mm) extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t); + +unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); + +unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, + struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags, + vm_flags_t vm_flags); unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} @@ -236,25 +251,16 @@ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | - PF_MEMALLOC_NOFS | - PF_MEMALLOC_NORECLAIM | - PF_MEMALLOC_NOWARN | - PF_MEMALLOC_PIN))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* - * Stronger flags before weaker flags: - * NORECLAIM implies NOIO, which in turn implies NOFS + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence */ - if (pflags & PF_MEMALLOC_NORECLAIM) - flags &= ~__GFP_DIRECT_RECLAIM; - else if (pflags & PF_MEMALLOC_NOIO) + if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; - if (pflags & PF_MEMALLOC_NOWARN) - flags |= __GFP_NOWARN; - if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } @@ -525,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. 
+ */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index ab83d85e1183..6ab43b4f72f9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -14,6 +14,7 @@ */ #define MAX_RT_PRIO 100 +#define MAX_DL_PRIO 0 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index b2b9e6eb9683..4e3338103654 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -6,19 +6,40 @@ struct task_struct; -static inline int rt_prio(int prio) +static inline bool rt_prio(int prio) { - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); } -static inline int rt_task(struct task_struct *p) +static inline bool rt_or_dl_prio(int prio) +{ + return unlikely(prio < MAX_RT_PRIO); +} + +/* + * Returns true if a task has a priority that belongs to RT class. PI-boosted + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. + */ +static inline bool rt_task(struct task_struct *p) { return rt_prio(p->prio); } -static inline bool task_is_realtime(struct task_struct *tsk) +/* + * Returns true if a task has a priority that belongs to RT or DL classes. + * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore + * PI-boosted tasks. + */ +static inline bool rt_or_dl_task(struct task_struct *p) +{ + return rt_or_dl_prio(p->prio); +} + +/* + * Returns true if a task has a policy that belongs to RT or DL classes. + * PI-boosted tasks will return false. + */ +static inline bool rt_or_dl_task_policy(struct task_struct *tsk) { int policy = tsk->policy; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0a0e23c45406..1ef1edbaaf79 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -136,8 +136,10 @@ struct signal_struct { #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ - unsigned int next_posix_timer_id; - struct list_head posix_timers; + unsigned int timer_create_restore_ids:1; + atomic_t next_posix_timer_id; + struct hlist_head posix_timers; + struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; @@ -276,8 +278,7 @@ static inline void signal_set_stop_flags(struct signal_struct *sig, extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *task, sigset_t *mask, - kernel_siginfo_t *info, enum pid_type *type); +extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { @@ -287,7 +288,7 @@ static inline int kernel_dequeue_signal(void) int ret; spin_lock_irq(&task->sighand->siglock); - ret = dequeue_signal(task, &task->blocked, &__info, &__type); + ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; @@ -339,9 +340,6 @@ extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); -extern struct sigqueue *sigqueue_alloc(void); -extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(struct sigqueue 
*, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index fb1e295e7e63..166b19af956f 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -12,7 +12,7 @@ static __always_inline bool sched_smt_active(void) return static_branch_likely(&sched_smt_present); } #else -static inline bool sched_smt_active(void) { return false; } +static __always_inline bool sched_smt_active(void) { return false; } #endif void arch_smt_update(void); diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d362aacf9f89..ca1db4b92c32 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -43,6 +43,7 @@ struct kernel_clone_args { void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* @@ -63,7 +64,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); @@ -119,6 +121,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) return t; } +static inline struct task_struct *tryget_task_struct(struct task_struct *t) +{ + return refcount_inc_not_zero(&t->usage) ? t : NULL; +} + extern void __put_task_struct(struct task_struct *t); extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index ccd72b978e1f..85c5a6392e02 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -9,6 +9,7 @@ #include <linux/sched.h> #include <linux/magic.h> #include <linux/refcount.h> +#include <linux/kasan.h> #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -33,7 +34,7 @@ static __always_inline unsigned long *end_of_stack(const struct task_struct *tas #endif } -#elif !defined(__HAVE_THREAD_FUNCTIONS) +#else #define task_stack_page(task) ((void *)(task)->stack) @@ -89,34 +90,22 @@ static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); + obj = kasan_reset_tag(obj); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p); +#else static inline unsigned long stack_not_used(struct task_struct *p) { - unsigned long *n = end_of_stack(p); - - do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP - n--; -# else - n++; -# endif - } while (!*n); - -# ifdef CONFIG_STACK_GROWSUP - return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else - return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif + return 0; } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); -#ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: @@ -124,6 +113,5 @@ static inline int kstack_end(void *addr) */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } -#endif #endif /* _LINUX_SCHED_TASK_STACK_H */ diff --git a/include/linux/sched/topology.h 
b/include/linux/sched/topology.h index 18572c9ea724..198bb5cc1774 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,16 +25,12 @@ enum { }; #undef SD_FLAG -#ifdef CONFIG_SCHED_DEBUG - struct sd_flag_debug { unsigned int meta_flags; char *name; }; extern const struct sd_flag_debug sd_flag_debug[]; -#endif - #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { @@ -110,11 +106,14 @@ struct sched_domain { unsigned long last_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS - /* load_balance() stats */ + /* sched_balance_rq() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; @@ -140,9 +139,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -165,10 +162,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void partition_sched_domains_locked(int ndoms_new, - cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new); - extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -198,30 +191,20 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); +extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); + -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ struct sched_domain_attr; static inline void -partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { @@ -242,6 +225,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) return true; } +static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) @@ -270,17 +257,17 @@ unsigned long arch_scale_cpu_capacity(int cpu) } #endif -#ifndef arch_scale_thermal_pressure +#ifndef arch_scale_hw_pressure static __always_inline -unsigned long arch_scale_thermal_pressure(int cpu) +unsigned long arch_scale_hw_pressure(int cpu) { return 0; } #endif -#ifndef arch_update_thermal_pressure +#ifndef arch_update_hw_pressure static __always_inline -void arch_update_thermal_pressure(const struct cpumask *cpus, +void arch_update_hw_pressure(const struct cpumask *cpus, unsigned long capped_frequency) { } #endif diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h index bc60243d43b3..25446c5d3508 100644 --- a/include/linux/sched/vhost_task.h +++ b/include/linux/sched/vhost_task.h @@ -4,7 +4,8 @@ 
 
 struct vhost_task;
 
-struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
+struct vhost_task *vhost_task_create(bool (*fn)(void *),
+				     void (*handle_kill)(void *), void *arg,
 				     const char *name);
 void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 06cd8fb2f409..0f28b4623ad4 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -63,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
 extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
+/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */
+static inline
+void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+	guard(preempt)();
+	raw_spin_unlock(lock);
+	if (wake_q) {
+		wake_up_q(wake_q);
+		wake_q_init(wake_q);
+	}
+}
+
+static inline
+void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q)
+{
+	guard(preempt)();
+	raw_spin_unlock_irq(lock);
+	if (wake_q) {
+		wake_up_q(wake_q);
+		wake_q_init(wake_q);
+	}
+}
+
+static inline
+void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags,
+				     struct wake_q_head *wake_q)
+{
+	guard(preempt)();
+	raw_spin_unlock_irqrestore(lock, flags);
+	if (wake_q) {
+		wake_up_q(wake_q);
+		wake_q_init(wake_q);
+	}
+}
 
 #endif /* _LINUX_SCHED_WAKE_Q_H */
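
The coredump.h hunk above removes the MMF_* mm-flag definitions (dump filter bits, KSM and THP flags, the MDWE bits) together with mmf_init_flags(), which decides which of those flags a child mm inherits over fork; they are being consolidated outside this header. As a reference for what the removed helper did, here is a minimal standalone mimic built only from the bit numbers visible in the hunk; the main() driver is illustrative, not kernel code.

#include <stdio.h>

/* Bit numbers copied from the removed block in coredump.h. */
#define MMF_DUMPABLE_BITS	2
#define MMF_DUMPABLE_MASK	((1UL << MMF_DUMPABLE_BITS) - 1)
#define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS	9
#define MMF_DUMP_FILTER_MASK	\
	(((1UL << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DISABLE_THP		24
#define MMF_HAS_MDWE		28
#define MMF_HAS_MDWE_NO_INHERIT	29
#define MMF_VM_MERGE_ANY	30

#define MMF_INIT_MASK	(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |	\
			 (1UL << MMF_DISABLE_THP) | (1UL << MMF_HAS_MDWE) | \
			 (1UL << MMF_VM_MERGE_ANY))

/* MDWE marked "no inherit" is dropped (together with the marker) on fork. */
static unsigned long mmf_init_flags(unsigned long flags)
{
	if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
		flags &= ~((1UL << MMF_HAS_MDWE) |
			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
	return flags & MMF_INIT_MASK;
}

int main(void)
{
	unsigned long parent = (1UL << MMF_HAS_MDWE) |
			       (1UL << MMF_HAS_MDWE_NO_INHERIT);

	/* The child ends up with neither MDWE bit set. */
	printf("parent=%#lx child=%#lx\n", parent, mmf_init_flags(parent));
	return 0;
}

A parent that enabled MDWE with the no-inherit option forks a child carrying neither bit, while a plain MDWE parent passes the flag through unchanged.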
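
The prio.h, rt.h and deadline.h hunks above move MAX_DL_PRIO next to MAX_RT_PRIO, turn the class tests into bool, make rt_prio() exclude deadline priorities, and add rt_or_dl_prio()/rt_or_dl_task() for the combined check; the new comments stress that these look at p->prio, so PI-boosted tasks also return true. A minimal standalone sketch of the resulting ranges (a userspace restatement of the helpers, not the headers themselves):

#include <stdbool.h>
#include <stdio.h>

#define MAX_DL_PRIO	0	/* deadline tasks sit below 0 */
#define MAX_RT_PRIO	100	/* RT priorities are 0..99 */

static bool dl_prio(int prio)       { return prio < MAX_DL_PRIO; }
static bool rt_prio(int prio)       { return prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO; }
static bool rt_or_dl_prio(int prio) { return prio < MAX_RT_PRIO; }

int main(void)
{
	/* -1: effective prio of a DL task (or a task PI-boosted by one) */
	printf("prio  -1: dl=%d rt=%d rt_or_dl=%d\n",
	       dl_prio(-1), rt_prio(-1), rt_or_dl_prio(-1));
	/* 10: an RT priority; 120: a normal fair-class priority */
	printf("prio  10: dl=%d rt=%d rt_or_dl=%d\n",
	       dl_prio(10), rt_prio(10), rt_or_dl_prio(10));
	printf("prio 120: dl=%d rt=%d rt_or_dl=%d\n",
	       dl_prio(120), rt_prio(120), rt_or_dl_prio(120));
	return 0;
}

As the new comments note, dl_policy(), rt_policy() and rt_or_dl_task_policy() remain the variants to use when PI boosting must be ignored.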
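
The new ext.h above documents the dispatch-queue (DSQ) ID layout: bit 63 marks a built-in DSQ, bit 62 marks a LOCAL_ON ID, and for LOCAL_ON IDs the low 32 bits carry a CPU number. The sketch below encodes and decodes such an ID; make_local_on_dsq(), dsq_is_local_on() and dsq_local_on_cpu() are hypothetical helper names, only the SCX_DSQ_* constants come from the hunk.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCX_DSQ_FLAG_BUILTIN	(1ULL << 63)
#define SCX_DSQ_FLAG_LOCAL_ON	(1ULL << 62)
#define SCX_DSQ_LOCAL_ON	(SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON)
#define SCX_DSQ_LOCAL_CPU_MASK	0xffffffffULL

/* Hypothetical helpers: build/decode a LOCAL_ON DSQ ID for a given CPU. */
static uint64_t make_local_on_dsq(int cpu)
{
	return SCX_DSQ_LOCAL_ON | (uint32_t)cpu;
}

static bool dsq_is_local_on(uint64_t dsq_id)
{
	return (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON;
}

static int dsq_local_on_cpu(uint64_t dsq_id)
{
	return (int)(dsq_id & SCX_DSQ_LOCAL_CPU_MASK);
}

int main(void)
{
	uint64_t id = make_local_on_dsq(5);

	printf("id=%#llx local_on=%d cpu=%d\n",
	       (unsigned long long)id, dsq_is_local_on(id), dsq_local_on_cpu(id));
	return 0;
}

IDs of the form SCX_DSQ_LOCAL_ON | cpu are how a specific CPU's local DSQ is typically addressed by a BPF scheduler.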
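
In the mm.h hunk above, current_gfp_context() loses the PF_MEMALLOC_NORECLAIM/PF_MEMALLOC_NOWARN handling, leaving three scoped restrictions: NOIO strips __GFP_IO and __GFP_FS, NOFS strips __GFP_FS, and PIN strips __GFP_MOVABLE, with NOIO taking precedence over NOFS. A standalone mimic of that masking follows; the bit values are illustrative, not the kernel's real __GFP_*/PF_* encodings.

#include <stdio.h>

/* Illustrative bit values only. */
#define __GFP_IO		0x1u
#define __GFP_FS		0x2u
#define __GFP_MOVABLE		0x4u
#define GFP_KERNEL		(__GFP_IO | __GFP_FS)

#define PF_MEMALLOC_NOIO	0x10u
#define PF_MEMALLOC_NOFS	0x20u
#define PF_MEMALLOC_PIN		0x40u

/* Same precedence as the hunk: NOIO (strips IO+FS) wins over NOFS (strips FS). */
static unsigned int current_gfp_context(unsigned int pflags, unsigned int flags)
{
	if (pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN)) {
		if (pflags & PF_MEMALLOC_NOIO)
			flags &= ~(__GFP_IO | __GFP_FS);
		else if (pflags & PF_MEMALLOC_NOFS)
			flags &= ~__GFP_FS;
		if (pflags & PF_MEMALLOC_PIN)
			flags &= ~__GFP_MOVABLE;
	}
	return flags;
}

int main(void)
{
	printf("NOIO scope: %#x -> %#x\n", GFP_KERNEL,
	       current_gfp_context(PF_MEMALLOC_NOIO, GFP_KERNEL));
	printf("NOFS scope: %#x -> %#x\n", GFP_KERNEL,
	       current_gfp_context(PF_MEMALLOC_NOFS, GFP_KERNEL));
	return 0;
}

Kernel code normally enters these scopes through memalloc_noio_save()/memalloc_nofs_save() and the matching *_restore() helpers in this same header rather than by editing current->flags directly.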
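
task.h above adds tryget_task_struct(), which takes a reference only if the task's usage count is still non-zero. Below is a sketch of the kind of RCU lookup it is meant for; grab_task() is a hypothetical caller, while pid_task(), put_task_struct() and the RCU calls are existing kernel APIs.

#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

/* Hypothetical caller: look up a task by pid and pin it, or return NULL. */
static struct task_struct *grab_task(struct pid *pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = pid_task(pid, PIDTYPE_PID);		/* RCU-protected, not yet pinned */
	if (p && !tryget_task_struct(p))
		p = NULL;			/* usage count already hit zero */
	rcu_read_unlock();

	return p;				/* caller releases with put_task_struct(p) */
}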
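
The wake_q.h hunk above adds raw_spin_unlock_wake(), raw_spin_unlock_irq_wake() and raw_spin_unlock_irqrestore_wake(), which drop the lock and then run wake_up_q() plus wake_q_init() with preemption disabled. A sketch of the calling pattern they support; struct my_waiter, the waiters list and my_lock are hypothetical, while the wake_q and locking calls are the real APIs.

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

/* Hypothetical waiter bookkeeping; only the wake_q/locking calls are real APIs. */
struct my_waiter {
	struct list_head	node;
	struct task_struct	*task;
};

static void wake_all_waiters(raw_spinlock_t *my_lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct my_waiter *w;
	unsigned long flags;

	raw_spin_lock_irqsave(my_lock, flags);
	list_for_each_entry(w, waiters, node)
		wake_q_add(&wake_q, w->task);	/* collect wakeups, none issued yet */

	/* Unlock first, then wake_up_q() + wake_q_init() with preemption disabled. */
	raw_spin_unlock_irqrestore_wake(my_lock, flags, &wake_q);
}

Keeping the actual wakeups out of the locked region is the point of the wake_q API; the new helpers package the unlock-then-wake tail so callers cannot easily get it wrong.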