From c5895d3f06cbb80ccb311f1dcb37074651030cb6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 4 May 2021 22:43:42 +0200
Subject: sched: Simplify sched_info_on()

The situation around sched_info is somewhat complicated: it is used by
sched_stats and delayacct and, indirectly, by kvm.

If SCHEDSTATS=Y (but disabled by default) sched_info_on() is
unconditionally true -- this is the case for all distro kernel configs
I checked.

If for some reason SCHEDSTATS=N, but TASK_DELAY_ACCT=Y, then
sched_info_on() can return false when delayacct is disabled, presumably
because there would be no other users left; except kvm is.

Instead of complicating matters further by accurately accounting
sched_stat and kvm state, simply unconditionally enable when
SCHED_INFO=Y, matching the common distro case.

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Reviewed-by: Ingo Molnar
Acked-by: Johannes Weiner
Link: https://lkml.kernel.org/r/20210505111525.121458839@infradead.org
---
 include/linux/sched/stat.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 568286411b43..939c3ec9e1b9 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -3,6 +3,7 @@
 #define _LINUX_SCHED_STAT_H
 
 #include <linux/percpu.h>
+#include <linux/kconfig.h>
 
 /*
  * Various counters maintained by the scheduler and fork(),
@@ -23,14 +24,7 @@ extern unsigned long nr_iowait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
-#ifdef CONFIG_SCHEDSTATS
-	return 1;
-#elif defined(CONFIG_TASK_DELAY_ACCT)
-	extern int delayacct_on;
-	return delayacct_on;
-#else
-	return 0;
-#endif
+	return IS_ENABLED(CONFIG_SCHED_INFO);
}
 
 #ifdef CONFIG_SCHEDSTATS
-- cgit

From eee4d9fee2544389e5ce5697ed92db67c86d7a9f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 4 May 2021 22:43:36 +0200
Subject: delayacct: Add static_branch in scheduler hooks

Cheaper when delayacct is disabled.

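[Illustration -- not part of the original patch. The hooks below use the
jump_label API, which patches the branch site in kernel text at runtime.
A minimal sketch of the pattern, with a hypothetical example key rather
than the real delayacct code:

	#include <linux/jump_label.h>

	/* Default-enabled key; a later patch in this series flips it to FALSE. */
	DEFINE_STATIC_KEY_TRUE(example_key);

	static inline void example_hook(void)
	{
		/* Compiles to a patchable jump, not a load-test-branch. */
		if (!static_branch_likely(&example_key))
			return;

		/* ... accounting work ... */
	}

Disabling the key via static_branch_disable(&example_key) rewrites the
branch site, so a disabled hook costs roughly a NOP instead of a memory
read plus conditional branch.]
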
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Reviewed-by: Ingo Molnar
Acked-by: Balbir Singh
Acked-by: Johannes Weiner
Link: https://lkml.kernel.org/r/20210505111525.248028369@infradead.org
---
 include/linux/delayacct.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 21651f946751..57fefa54b53a 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -58,8 +58,10 @@ struct task_delay_info {
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_TASK_DELAY_ACCT
+DECLARE_STATIC_KEY_TRUE(delayacct_key);
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
@@ -114,6 +116,9 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
+	if (!static_branch_likely(&delayacct_key))
+		return;
+
 	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
 	if (current->delays)
 		__delayacct_blkio_start();
@@ -121,6 +126,9 @@ static inline void delayacct_blkio_start(void)
 
 static inline void delayacct_blkio_end(struct task_struct *p)
 {
+	if (!static_branch_likely(&delayacct_key))
+		return;
+
 	if (p->delays)
 		__delayacct_blkio_end(p);
 	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
-- cgit

From e4042ad492357fa995921376462b04a025dd53b6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 4 May 2021 22:43:32 +0200
Subject: delayacct: Default disabled

Assuming this stuff isn't actually used much, disable it by default and
avoid allocating and tracking the task_delay_info structure.

taskstats is changed to still report the regular sched and sched_info
stats, and only skip the missing task_delay_info fields instead of not
reporting anything.

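[Illustration -- not part of the original patch. With the default flipped
to off, the key declaration and the test change polarity; a sketch of the
resulting pattern, including a hypothetical boot-time switch (the
kernel/delayacct.c side is outside this 'include'-limited view):

	DEFINE_STATIC_KEY_FALSE(example_key);		/* off unless enabled */

	static int __init example_setup(char *str)
	{
		static_branch_enable(&example_key);	/* opt in at boot */
		return 1;
	}
	__setup("example_feature", example_setup);

	/* hook side: */
	if (!static_branch_unlikely(&example_key))
		return;					/* default fast path */
]
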
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Thomas Gleixner
Reviewed-by: Ingo Molnar
Link: https://lkml.kernel.org/r/20210505111525.308018373@infradead.org
---
 include/linux/delayacct.h | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 57fefa54b53a..225c8e01a111 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -61,7 +61,7 @@ struct task_delay_info {
 #include <linux/jump_label.h>
 
 #ifdef CONFIG_TASK_DELAY_ACCT
-DECLARE_STATIC_KEY_TRUE(delayacct_key);
+DECLARE_STATIC_KEY_FALSE(delayacct_key);
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
@@ -69,7 +69,7 @@ extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
 extern void __delayacct_blkio_end(struct task_struct *);
-extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
+extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
 extern __u64 __delayacct_blkio_ticks(struct task_struct *);
 extern void __delayacct_freepages_start(void);
 extern void __delayacct_freepages_end(void);
@@ -116,7 +116,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
 
 static inline void delayacct_blkio_start(void)
 {
-	if (!static_branch_likely(&delayacct_key))
+	if (!static_branch_unlikely(&delayacct_key))
 		return;
 
 	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
@@ -126,7 +126,7 @@ static inline void delayacct_blkio_start(void)
 
 static inline void delayacct_blkio_end(struct task_struct *p)
 {
-	if (!static_branch_likely(&delayacct_key))
+	if (!static_branch_unlikely(&delayacct_key))
 		return;
 
 	if (p->delays)
@@ -134,14 +134,6 @@ static inline void delayacct_blkio_end(struct task_struct *p)
 	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
 }
 
-static inline int delayacct_add_tsk(struct taskstats *d,
-				    struct task_struct *tsk)
-{
-	if (!delayacct_on || !tsk->delays)
-		return 0;
-	return __delayacct_add_tsk(d, tsk);
-}
-
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	if (tsk->delays)
-- cgit

From 0cd7c741f01de13dc1eecf22557593b3514639bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 10 May 2021 14:01:00 +0200
Subject: delayacct: Add sysctl to enable at runtime

Just like sched_schedstats, allow runtime enabling (and disabling) of
delayacct. This is useful if one forgot to add the delayacct boot-time
option.

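[Illustration -- not part of the original patch. A sketch of what such a
handler typically looks like, modeled on the existing sysctl_schedstats();
set_delayacct() is an assumed helper that flips the static key and the
delayacct_on flag together:

	int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
			     size_t *lenp, loff_t *ppos)
	{
		int state = delayacct_on;
		struct ctl_table t;
		int err;

		if (write && !capable(CAP_SYS_ADMIN))
			return -EPERM;

		t = *table;
		t.data = &state;
		err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (err < 0)
			return err;
		if (write)
			set_delayacct(state);	/* assumed helper */
		return err;
	}
]
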
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/YJkhebGJAywaZowX@hirez.programming.kicks-ass.net
---
 include/linux/delayacct.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 225c8e01a111..af7e6eb50283 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -65,6 +65,10 @@ DECLARE_STATIC_KEY_FALSE(delayacct_key);
 extern int delayacct_on;	/* Delay accounting turned on/off */
 extern struct kmem_cache *delayacct_cache;
 extern void delayacct_init(void);
+
+extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+			    size_t *lenp, loff_t *ppos);
+
 extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
-- cgit

From 8a311c740b53324ec584e0e3bb7077d56b123c28 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 17 Nov 2020 18:19:36 -0500
Subject: sched: Basic tracking of matching tasks

Introduce task_struct::core_cookie as an opaque identifier for core
scheduling. When enabled, core scheduling will only allow matching
tasks to be on the core; idle matches everything.

When task_struct::core_cookie is set (and core scheduling is enabled)
these tasks are indexed in a second RB-tree, first on cookie value then
on scheduling function, such that matching task selection always finds
the most eligible match.

NOTE: *shudder* at the overhead...

NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative is
per-class tracking of cookies and that just duplicates a lot of stuff
for no raisin (the 2nd copy lives in the rt-mutex PI code).

[Joel: folded fixes]

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Joel Fernandes (Google)
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.496975854@infradead.org
---
 include/linux/sched.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..45eedccf86aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -700,10 +700,16 @@ struct task_struct {
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
+	struct sched_dl_entity		dl;
+
+#ifdef CONFIG_SCHED_CORE
+	struct rb_node			core_node;
+	unsigned long			core_cookie;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group		*sched_task_group;
 #endif
-	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
 	/*
-- cgit

From d2dfa17bc7de67e99685c4d6557837bf801a102c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 17 Nov 2020 18:19:43 -0500
Subject: sched: Trivial forced-newidle balancer

When a sibling is forced-idle to match the core-cookie, search for
matching tasks to fill the core.

rcu_read_unlock() can incur an infrequent deadlock in
sched_core_balance(). Fix this by using the RCU-sched flavor instead.

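[Illustration -- not part of the original patch. The matching rule the two
patches above rely on can be summarized with a hypothetical helper (the
kernel's actual predicate lives in kernel/sched and differs in detail):

	/* Tasks may share a core iff their cookies match; idle matches all. */
	static inline bool cookies_match(struct task_struct *a,
					 struct task_struct *b)
	{
		return is_idle_task(a) || is_idle_task(b) ||
		       a->core_cookie == b->core_cookie;
	}
]
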
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.800048269@infradead.org
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45eedccf86aa..9b822e383212 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -705,6 +705,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CORE
 	struct rb_node			core_node;
 	unsigned long			core_cookie;
+	unsigned int			core_occupation;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
-- cgit

From 6e33cad0af49336952e5541464bd02f5b5fd433e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 26 Mar 2021 18:55:06 +0100
Subject: sched: Trivial core scheduling cookie management

In order to not have to use pid_struct, create a new, smaller structure
to manage task cookies for core scheduling.

Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.919768100@infradead.org
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9b822e383212..eab3f7c4251b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2179,4 +2179,10 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
+#ifdef CONFIG_SCHED_CORE
+extern void sched_core_free(struct task_struct *tsk);
+#else
+static inline void sched_core_free(struct task_struct *tsk) { }
+#endif
+
 #endif
-- cgit

From 85dd3f61203c5cfa72b308ff327b5fbf3fc1ce5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 29 Mar 2021 15:18:35 +0200
Subject: sched: Inherit task cookie on fork()

Note that sched_core_fork() is called from under tasklist_lock, and not
from sched_fork() earlier. This avoids a few races later.

Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.980003687@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eab3f7c4251b..fba47e52e482 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2181,8 +2181,10 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
+extern void sched_core_fork(struct task_struct *p);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
+static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
 #endif
-- cgit

From 7ac592aa35a684ff1858fb9ec282886b9e3575ac Mon Sep 17 00:00:00 2001
From: Chris Hyser
Date: Wed, 24 Mar 2021 17:40:15 -0400
Subject: sched: prctl() core-scheduling interface

This patch provides support for setting and copying core scheduling
'task cookies' between threads (PID), processes (TGID), and process
groups (PGID).

The value of core scheduling isn't that tasks don't share a core;
'nosmt' can do that. The value lies in exploiting all the sharing
opportunities that exist to recover possible lost performance, and
that requires a degree of flexibility in the API.

From a security perspective (and there are others), the thread, process
and process-group distinction is an existing hierarchical categorization
of tasks that reflects many of the security concerns about 'data
sharing'. For example, protecting against cache-snooping by a thread
that can just read the memory directly isn't all that useful.

With this in mind, subcommands to CREATE/SHARE (TO/FROM) provide a
mechanism to create and share cookies. CREATE/SHARE_TO specify a target
pid with enum pidtype used to specify the scope of the targeted tasks.
For example, PIDTYPE_TGID will share the cookie with the process and all
of its threads, as typically desired in a security scenario.

API:

  prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, tgtpid, pidtype, &cookie)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, tgtpid, pidtype, NULL)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, tgtpid, pidtype, NULL)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, srcpid, pidtype, NULL)

where 'tgtpid/srcpid == 0' implies the current process and pidtype is
kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}.

For return values, EINVAL and ENOMEM are what they say. ESRCH means the
tgtpid/srcpid was not found. EPERM indicates lack of PTRACE permission
access to tgtpid/srcpid. ENODEV indicates your machine lacks SMT.

[peterz: complete rewrite]

Signed-off-by: Chris Hyser
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123309.039845339@infradead.org
---
 include/linux/sched.h      |  2 ++
 include/uapi/linux/prctl.h |  8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fba47e52e482..c7e7d50e2fdc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2182,6 +2182,8 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
+extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+				unsigned long uaddr);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 18a9f59dc067..967d9c55323d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -259,4 +259,12 @@ struct prctl_mm_map {
 #define PR_PAC_SET_ENABLED_KEYS		60
 #define PR_PAC_GET_ENABLED_KEYS		61
 
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE			62
+# define PR_SCHED_CORE_GET		0
+# define PR_SCHED_CORE_CREATE		1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO		2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM	3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX		4
+
 #endif /* _LINUX_PRCTL_H */
-- cgit

From f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Wed, 12 May 2021 10:46:36 +0100
Subject: sched/core: Initialize the idle task with preemption disabled

As pointed out by commit de9b8f5dcbd9 ("sched: Fix crash trying to
dequeue/enqueue the idle thread"), init_idle() can and will be invoked
more than once on the same idle task. At boot time, it is invoked for
the boot CPU thread by sched_init(). Then smp_init() creates the
threads for all the secondary CPUs and invokes init_idle() on them.

As the hotplug machinery brings the secondaries to life, it will issue
calls to idle_thread_get(), which itself invokes init_idle() yet again.
In this case it's invoked twice more per secondary: at _cpu_up(), and at
bringup_cpu().

Given smp_init() already initializes the idle tasks for all *possible*
CPUs, no further initialization should be required. Now, removing
init_idle() from idle_thread_get() exposes some interesting expectations
with regards to the idle task's preempt_count: the secondary startup
always issues a preempt_disable(), requiring some reset of the preempt
count to 0 between hot-unplug and hotplug, which is currently served by
idle_thread_get() -> idle_init().

Given the idle task is supposed to have preemption disabled once and
never see it re-enabled, it seems that what we actually want is to
initialize its preempt_count to PREEMPT_DISABLED and leave it there. Do
that, and remove init_idle() from idle_thread_get().

Secondary startups were patched via coccinelle:

  @begone@
  @@
  -preempt_disable();
  ...
  cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);

Signed-off-by: Valentin Schneider
Signed-off-by: Ingo Molnar
Acked-by: Peter Zijlstra
Link: https://lore.kernel.org/r/20210512094636.2958515-1-valentin.schneider@arm.com
---
 include/asm-generic/preempt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d683f5e6d791..b4d43a4af5f7 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -29,7 +29,7 @@ static __always_inline void preempt_count_set(int pc)
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
-	task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+	task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
 } while (0)
 
 static __always_inline void set_preempt_need_resched(void)
-- cgit

From cc00c1988801dc71f63bb7bad019e85046865095 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 12 May 2021 19:51:31 +0200
Subject: sched: Fix leftover comment typos

A few more snuck in. Also capitalize 'CPU' while at it.

Signed-off-by: Ingo Molnar
---
 include/linux/sched_clock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 528718e4ed52..835ee87ed792 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -14,7 +14,7 @@
 * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
 *			clocks.
 * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multipler for scaled math conversion.
+ * @mult:		Multiplier for scaled math conversion.
 * @shift:		Shift value for scaled math conversion.
 *
 * Care must be taken when updating this structure; it is read by
-- cgit

From 01aee8fd7fb23049e2b52abadbe1f7b5e94a52d2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Thu, 22 Apr 2021 23:02:25 +0300
Subject: sched: Make nr_running() return 32-bit value

Creating 2**32 tasks is impossible due to futex pid limits and wasteful
anyway. Nobody has done it.

Bring nr_running() into the 32-bit world to save on REX prefixes.

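[Illustration -- not part of the original patch. The body behind the
changed prototype is a simple per-CPU sum; roughly, per the
kernel/sched/core.c side that this 'include'-limited view omits:

	unsigned int nr_running(void)
	{
		unsigned int i, sum = 0;

		for_each_online_cpu(i)
			sum += cpu_rq(i)->nr_running;

		return sum;
	}

Each runqueue's ->nr_running is a 32-bit counter already, so the narrower
return type loses nothing; on x86-64 it simply drops the REX.W prefix
from the arithmetic.]
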
Signed-off-by: Alexey Dobriyan
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20210422200228.1423391-1-adobriyan@gmail.com
---
 include/linux/sched/stat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 939c3ec9e1b9..73606b3de394 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -17,7 +17,7 @@ extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
-extern unsigned long nr_running(void);
+extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
-- cgit

From 9745516841a55c77163a5d549bce1374d776df54 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Thu, 22 Apr 2021 23:02:26 +0300
Subject: sched: Make nr_iowait() return 32-bit value

Creating 2**32 tasks to wait in D-state is impossible and wasteful.

Return "unsigned int" and save on REX prefixes.

Signed-off-by: Alexey Dobriyan
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20210422200228.1423391-2-adobriyan@gmail.com
---
 include/linux/sched/stat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 73606b3de394..81d9b539e3b7 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -19,7 +19,7 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned int nr_running(void);
 extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
+extern unsigned int nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 
 static inline int sched_info_on(void)
-- cgit

From 8fc2858e572ce761bffcade81a42ac72005e76f9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan
Date: Thu, 22 Apr 2021 23:02:27 +0300
Subject: sched: Make nr_iowait_cpu() return 32-bit value

Runqueue ->nr_iowait counters are 32-bit anyway. Propagate 32-bitness
into other code, but don't try too hard.

Signed-off-by: Alexey Dobriyan
Signed-off-by: Ingo Molnar
Link: https://lore.kernel.org/r/20210422200228.1423391-3-adobriyan@gmail.com
---
 include/linux/sched/stat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 81d9b539e3b7..0108a38bb64d 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -20,7 +20,7 @@ extern int nr_processes(void);
 extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned int nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
+extern unsigned int nr_iowait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
-- cgit

From 00b89fe0197f0c55a045775c11553c0cdb7082fe Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Mon, 10 May 2021 16:10:23 +0100
Subject: sched: Make the idle task quack like a per-CPU kthread

For all intents and purposes, the idle task is a per-CPU kthread. It
isn't created via the same route as other per-CPU kthreads, however,
and as a result it is missing a few bells and whistles: it fails
kthread_is_per_cpu() and it doesn't have PF_NO_SETAFFINITY set.

Fix the former by giving the idle task a kthread struct along with the
KTHREAD_IS_PER_CPU flag. This requires some extra iffery as init_idle()
can be called more than once on the same idle task.

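[Illustration -- not part of the original patch. A plausible sketch of
how the new hook slots into init_idle(); the kernel/sched/core.c side is
not visible in this 'include'-limited diff, so the exact call sites are
assumed:

	void init_idle(struct task_struct *idle, int cpu)
	{
		/* ... */
		set_kthread_struct(idle);	/* give idle a struct kthread */
		kthread_set_per_cpu(idle, cpu);	/* make kthread_is_per_cpu() true */
		/* ... */
	}
]
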
Signed-off-by: Valentin Schneider
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20210510151024.2448573-2-valentin.schneider@arm.com
---
 include/linux/kthread.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2484ed97e72f..d9133d6db308 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
					  unsigned int cpu,
					  const char *namefmt);
 
+void set_kthread_struct(struct task_struct *p);
+
 void kthread_set_per_cpu(struct task_struct *k, int cpu);
 bool kthread_is_per_cpu(struct task_struct *k);
-- cgit

From 8f1b971b4750e83e8fbd2f91a9efd4a38ad0ae51 Mon Sep 17 00:00:00 2001
From: Lukasz Luba
Date: Mon, 14 Jun 2021 20:12:38 +0100
Subject: sched/cpufreq: Consider reduced CPU capacity in energy calculation

Energy Aware Scheduling (EAS) needs to predict the decisions made by
SchedUtil. The map_util_freq() helper exists to do that.

There are corner cases where the max allowed frequency might be reduced
(due to thermal). SchedUtil, as a CPUFreq governor, is aware of that,
but EAS is not. This patch aims to address it.

SchedUtil stores the maximum allowed frequency in the
'sugov_policy::next_freq' field. EAS has to predict that value, which
is the real used frequency. That value is produced by a call to
cpufreq_driver_resolve_freq(), which clamps to the CPUFreq policy
limits. In the existing code EAS is not able to predict that real
frequency. This leads to energy estimation errors.

To avoid wrong energy estimation in EAS (due to frequency
misprediction), make sure that the step which calculates the
Performance Domain frequency is also aware of the allowed CPU capacity.

Furthermore, modify map_util_freq() to not extend the frequency value.
Instead, use map_util_perf() to extend the util value in both places:
SchedUtil and EAS, but for EAS clamp it to the max allowed CPU
capacity. In the end, we achieve the same desirable behavior for both
subsystems and alignment with regard to the real CPU frequency.

Signed-off-by: Lukasz Luba
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Rafael J. Wysocki (For the schedutil part)
Link: https://lore.kernel.org/r/20210614191238.23224-1-lukasz.luba@arm.com
---
 include/linux/energy_model.h  | 16 +++++++++++++---
 include/linux/sched/cpufreq.h |  2 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 757fc60658fa..3f221dbf5f95 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -91,6 +91,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
 * @pd		: performance domain for which energy has to be estimated
 * @max_util	: highest utilization among CPUs of the domain
 * @sum_util	: sum of the utilization of all CPUs in the domain
+ * @allowed_cpu_cap	: maximum allowed CPU capacity for the @pd, which
+			  might reflect reduced frequency (due to thermal)
 *
 * This function must be used only for CPU devices. There is no validation,
 * i.e. if the EM is a CPU type and has cpumask allocated. It is called from
@@ -100,7 +102,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
 * a capacity state satisfying the max utilization of the domain.
 */
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
-				unsigned long max_util, unsigned long sum_util)
+				unsigned long max_util, unsigned long sum_util,
+				unsigned long allowed_cpu_cap)
{
	unsigned long freq, scale_cpu;
	struct em_perf_state *ps;
@@ -112,11 +115,17 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
	/*
	 * In order to predict the performance state, map the utilization of
	 * the most utilized CPU of the performance domain to a requested
-	 * frequency, like schedutil.
+	 * frequency, like schedutil. Take also into account that the real
+	 * frequency might be set lower (due to thermal capping). Thus, clamp
+	 * max utilization to the allowed CPU capacity before calculating
+	 * effective frequency.
	 */
	cpu = cpumask_first(to_cpumask(pd->cpus));
	scale_cpu = arch_scale_cpu_capacity(cpu);
	ps = &pd->table[pd->nr_perf_states - 1];
+
+	max_util = map_util_perf(max_util);
+	max_util = min(max_util, allowed_cpu_cap);
	freq = map_util_freq(max_util, ps->frequency, scale_cpu);
 
	/*
@@ -209,7 +218,8 @@ static inline struct em_perf_domain *em_pd_get(struct device *dev)
	return NULL;
}
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
-				unsigned long max_util, unsigned long sum_util)
+				unsigned long max_util, unsigned long sum_util,
+				unsigned long allowed_cpu_cap)
{
	return 0;
}
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 6205578ab6ee..bdd31ab93bc5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -26,7 +26,7 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy);
static inline unsigned long map_util_freq(unsigned long util,
					  unsigned long freq, unsigned long cap)
{
-	return (freq + (freq >> 2)) * util / cap;
+	return freq * util / cap;
}
 
static inline unsigned long map_util_perf(unsigned long util)
-- cgit

From b03fbd4ff24c5f075e58eb19261d5f8b3e40d7c6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 11 Jun 2021 10:28:12 +0200
Subject: sched: Introduce task_is_running()

Replace a bunch of 'p->state == TASK_RUNNING' with a new helper:
task_is_running(p).

Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Davidlohr Bueso
Acked-by: Geert Uytterhoeven
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20210611082838.222401495@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac5a7d29fd4f..2cd56352dae1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,6 +113,8 @@ struct task_group;
					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
					 TASK_PARKED)
 
+#define task_is_running(task)		(READ_ONCE((task)->state) == TASK_RUNNING)
+
 #define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
 
 #define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
-- cgit

From d6c23bb3a2ad2f8f7dd46292b8bc54d27f2fb3f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 11 Jun 2021 10:28:14 +0200
Subject: sched: Add get_current_state()

Remove yet another few p->state accesses.

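[Illustration -- not part of the original patch. The helpers read
naturally in the canonical wait-loop idiom (generic example, not code
from this series):

	/* sleeper */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (done)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	/* elsewhere, instead of open-coding p->state == TASK_RUNNING: */
	if (task_is_running(p))
		do_something(p);	/* hypothetical consumer */

get_current_state() gives the matching READ_ONCE()-wrapped accessor for
code that must inspect current's state without racing against wakeups.]
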
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20210611082838.347475156@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cd56352dae1..395c8906f502 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -213,6 +213,8 @@ struct task_group;
 
 #endif
 
+#define get_current_state()	READ_ONCE(current->state)
+
 /* Task command name length: */
 #define TASK_COMM_LEN			16
-- cgit

From 2f064a59a11ff9bc22e52e9678bc601404c7cb34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 11 Jun 2021 10:28:17 +0200
Subject: sched: Change task_struct::state

Change the type and name of task_struct::state. Drop the volatile and
shrink it to an 'unsigned int'. Rename it in order to find all uses
such that we can use READ_ONCE/WRITE_ONCE as appropriate.

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Daniel Bristot de Oliveira
Acked-by: Will Deacon
Acked-by: Daniel Thompson
Link: https://lore.kernel.org/r/20210611082838.550736351@infradead.org
---
 include/linux/sched.h        | 31 +++++++++++++++----------------
 include/linux/sched/debug.h  |  2 +-
 include/linux/sched/signal.h |  2 +-
 3 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 395c8906f502..50db9496c99d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,13 +113,13 @@ struct task_group;
					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
					 TASK_PARKED)
 
-#define task_is_running(task)		(READ_ONCE((task)->state) == TASK_RUNNING)
+#define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
+#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
 
-#define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
+#define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 
-#define task_is_stopped_or_traced(task)	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
@@ -134,14 +134,14 @@ struct task_group;
	do {							\
		WARN_ON_ONCE(is_special_task_state(state_value));\
		current->task_state_change = _THIS_IP_;		\
-		current->state = (state_value);			\
+		WRITE_ONCE(current->__state, (state_value));	\
	} while (0)
 
 #define set_current_state(state_value)				\
	do {							\
		WARN_ON_ONCE(is_special_task_state(state_value));\
		current->task_state_change = _THIS_IP_;		\
-		smp_store_mb(current->state, (state_value));	\
+		smp_store_mb(current->__state, (state_value));	\
	} while (0)
 
 #define set_special_state(state_value)					\
@@ -150,7 +150,7 @@ struct task_group;
		WARN_ON_ONCE(!is_special_task_state(state_value));	\
		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
		current->task_state_change = _THIS_IP_;			\
-		current->state = (state_value);				\
+		WRITE_ONCE(current->__state, (state_value));		\
		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
	} while (0)
 #else
@@ -192,10 +192,10 @@ struct task_group;
 * Also see the comments of try_to_wake_up().
 */
 #define __set_current_state(state_value)				\
-	current->state = (state_value)
+	WRITE_ONCE(current->__state, (state_value))
 
 #define set_current_state(state_value)					\
-	smp_store_mb(current->state, (state_value))
+	smp_store_mb(current->__state, (state_value))
 
 /*
 * set_special_state() should be used for those states when the blocking task
@@ -207,13 +207,13 @@ struct task_group;
	do {								\
		unsigned long flags; /* may shadow */			\
		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
-		current->state = (state_value);				\
+		WRITE_ONCE(current->__state, (state_value));		\
		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
	} while (0)
 
 #endif
 
-#define get_current_state()	READ_ONCE(current->state)
+#define get_current_state()	READ_ONCE(current->__state)
 
 /* Task command name length: */
 #define TASK_COMM_LEN			16
@@ -666,8 +666,7 @@ struct task_struct {
	 */
	struct thread_info		thread_info;
 #endif
-	/* -1 unrunnable, 0 runnable, >0 stopped: */
-	volatile long			state;
+	unsigned int			__state;
 
	/*
	 * This begins the randomizable portion of task_struct. Only
@@ -1532,7 +1531,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 
 static inline unsigned int task_state_index(struct task_struct *tsk)
 {
-	unsigned int tsk_state = READ_ONCE(tsk->state);
+	unsigned int tsk_state = READ_ONCE(tsk->__state);
	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
 
	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
@@ -1840,10 +1839,10 @@ static __always_inline void scheduler_ipi(void)
	 */
	preempt_fold_need_resched();
 }
-extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
 #else
 static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
	return 1;
 }
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index ae51f4529fc9..b5035afa2396 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -14,7 +14,7 @@ extern void dump_cpu_task(int cpu);
 /*
 * Only dump TASK_* tasks. (0 for all tasks)
 */
-extern void show_state_filter(unsigned long state_filter);
+extern void show_state_filter(unsigned int state_filter);
 
 static inline void show_state(void)
 {
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7f4278fa21fe..c9cf678c347d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -382,7 +382,7 @@ static inline int fatal_signal_pending(struct task_struct *p)
	return task_sigpending(p) && __fatal_signal_pending(p);
 }
 
-static inline int signal_pending_state(long state, struct task_struct *p)
+static inline int signal_pending_state(unsigned int state, struct task_struct *p)
 {
	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
		return 0;
-- cgit

From 2309a05d2abe713f7debc951640b010370c8befb Mon Sep 17 00:00:00 2001
From: Beata Michalska
Date: Thu, 3 Jun 2021 15:06:25 +0100
Subject: sched/core: Introduce SD_ASYM_CPUCAPACITY_FULL sched_domain flag

Introduce a new sched_domain topology flag, complementary to
SD_ASYM_CPUCAPACITY, to distinguish between sched_domains where any CPU
capacity asymmetry is detected (SD_ASYM_CPUCAPACITY) and ones where a
full set of CPU capacities is visible to all domain members
(SD_ASYM_CPUCAPACITY_FULL).

With the distinction between full and partial CPU capacity asymmetry
brought in by the newly introduced flag, the scope of the original
SD_ASYM_CPUCAPACITY flag gets shifted, still maintaining the existing
behaviour when one is detected on a given sched domain: misfit
migrations are allowed within sched domains that do not observe the
full range of CPU capacities but still do have members with different
capacity values. It loses its meaning, though, when it comes to the
lowest CPU asymmetry sched_domain level per-CPU pointer, which is now
to be denoted by the SD_ASYM_CPUCAPACITY_FULL flag.

Signed-off-by: Beata Michalska
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
Reviewed-by: Dietmar Eggemann
Link: https://lore.kernel.org/r/20210603140627.8409-2-beata.michalska@arm.com
---
 include/linux/sched/sd_flags.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e971d77..57bde66d95f7 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -90,6 +90,16 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
 */
 SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members have different CPU capacities spanning all unique CPU
+ * capacity values.
+ *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ *                all available CPU capacities are visible
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
 /*
 * Domain members share CPU capacity (i.e. SMT)
 *
-- cgit
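
[Illustration -- not part of the original patch. Consumers are expected
to resolve the lowest domain carrying the new flag through the existing
per-CPU pointer machinery; roughly, following the sd_asym_cpucapacity
pattern that later patches rekey on SD_ASYM_CPUCAPACITY_FULL:

	struct sched_domain *sd;

	rcu_read_lock();
	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
	if (sd) {
		/* every unique CPU capacity is visible within sd's span */
	}
	rcu_read_unlock();
]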