From 8a311c740b53324ec584e0e3bb7077d56b123c28 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 17 Nov 2020 18:19:36 -0500
Subject: sched: Basic tracking of matching tasks

Introduce task_struct::core_cookie as an opaque identifier for core
scheduling. When enabled, core scheduling will only allow matching
tasks to be on the core, where idle matches everything.

When task_struct::core_cookie is set (and core scheduling is enabled)
these tasks are indexed in a second RB-tree, first on cookie value
then on scheduling function, such that matching task selection always
finds the most eligible match.

NOTE: *shudder* at the overhead...

NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative
      is per-class tracking of cookies and that just duplicates a lot
      of stuff for no raisin (the 2nd copy lives in the rt-mutex PI
      code).

[Joel: folded fixes]
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Joel Fernandes (Google)
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.496975854@infradead.org
---
 include/linux/sched.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..45eedccf86aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -700,10 +700,16 @@ struct task_struct {
 	const struct sched_class	*sched_class;
 	struct sched_entity		se;
 	struct sched_rt_entity		rt;
+	struct sched_dl_entity		dl;
+
+#ifdef CONFIG_SCHED_CORE
+	struct rb_node			core_node;
+	unsigned long			core_cookie;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group		*sched_task_group;
 #endif
-	struct sched_dl_entity		dl;
 
 #ifdef CONFIG_UCLAMP_TASK
 	/*
-- cgit

From d2dfa17bc7de67e99685c4d6557837bf801a102c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 17 Nov 2020 18:19:43 -0500
Subject: sched: Trivial forced-newidle balancer

When a sibling is forced-idle to match the core-cookie, search for
matching tasks to fill the core.

rcu_read_unlock() can incur an infrequent deadlock in
sched_core_balance(). Fix this by using the RCU-sched flavor instead.

Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.800048269@infradead.org
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 45eedccf86aa..9b822e383212 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -705,6 +705,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CORE
 	struct rb_node			core_node;
 	unsigned long			core_cookie;
+	unsigned int			core_occupation;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
-- cgit
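The RB-tree ordering described in the first patch above, first on cookie
value and then on scheduling order, can be sketched in plain C. This is
an illustration only, not the kernel code: the kernel keeps runnable
tasks in a per-runqueue RB-tree, while this sketch merely sorts an
array; struct fake_task and its prio field are invented stand-ins.

#include <stdio.h>
#include <stdlib.h>

struct fake_task {
	unsigned long core_cookie;	/* opaque match identifier */
	int prio;			/* stand-in for per-class scheduling order */
};

/* Compare first on cookie value, then on scheduling order within a cookie. */
static int core_task_cmp(const void *a, const void *b)
{
	const struct fake_task *ta = a, *tb = b;

	if (ta->core_cookie != tb->core_cookie)
		return ta->core_cookie < tb->core_cookie ? -1 : 1;
	return ta->prio - tb->prio;
}

int main(void)
{
	struct fake_task tasks[] = {
		{ .core_cookie = 2, .prio = 5 },
		{ .core_cookie = 1, .prio = 9 },
		{ .core_cookie = 2, .prio = 1 },
	};

	qsort(tasks, 3, sizeof(tasks[0]), core_task_cmp);

	/* All cookie-2 tasks are now adjacent, most eligible first. */
	for (int i = 0; i < 3; i++)
		printf("cookie=%lu prio=%d\n",
		       tasks[i].core_cookie, tasks[i].prio);
	return 0;
}

With this ordering, a lookup for a given cookie lands on a contiguous
range whose first element is the most eligible match, which is exactly
the property the matching task selection relies on.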
From 6e33cad0af49336952e5541464bd02f5b5fd433e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 26 Mar 2021 18:55:06 +0100
Subject: sched: Trivial core scheduling cookie management

In order to not have to use pid_struct, create a new, smaller,
structure to manage task cookies for core scheduling.

Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.919768100@infradead.org
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9b822e383212..eab3f7c4251b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2179,4 +2179,10 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
+#ifdef CONFIG_SCHED_CORE
+extern void sched_core_free(struct task_struct *tsk);
+#else
+static inline void sched_core_free(struct task_struct *tsk) { }
+#endif
+
 #endif
-- cgit

From 85dd3f61203c5cfa72b308ff327b5fbf3fc1ce5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 29 Mar 2021 15:18:35 +0200
Subject: sched: Inherit task cookie on fork()

Note that sched_core_fork() is called from under tasklist_lock, and
not from sched_fork() earlier. This avoids a few races later.

Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123308.980003687@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eab3f7c4251b..fba47e52e482 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2181,8 +2181,10 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
+extern void sched_core_fork(struct task_struct *p);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
+static inline void sched_core_fork(struct task_struct *p) { }
 #endif
 
 #endif
-- cgit
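A rough userspace sketch of what such a smaller cookie structure can
look like: a refcounted object whose address serves as the opaque
cookie value, with references taken and dropped as tasks come to share
it (fork, in the patch above, takes such a reference; sched_core_free()
drops it on exit). Everything below, names and C11 atomics included, is
an illustrative stand-in, not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct core_cookie {
	atomic_uint refcount;
};

static unsigned long cookie_create(void)
{
	struct core_cookie *ck = malloc(sizeof(*ck));

	if (!ck)
		return 0;
	atomic_init(&ck->refcount, 1);
	return (unsigned long)ck;	/* address doubles as opaque id */
}

static unsigned long cookie_get(unsigned long cookie)
{
	struct core_cookie *ck = (struct core_cookie *)cookie;

	if (ck)
		atomic_fetch_add(&ck->refcount, 1);
	return cookie;
}

static void cookie_put(unsigned long cookie)
{
	struct core_cookie *ck = (struct core_cookie *)cookie;

	/* Dropping the last reference frees the object. */
	if (ck && atomic_fetch_sub(&ck->refcount, 1) == 1)
		free(ck);
}

int main(void)
{
	unsigned long c = cookie_create();

	cookie_get(c);		/* a forked child shares the cookie */
	cookie_put(c);		/* child exits */
	cookie_put(c);		/* parent exits; last put frees */
	printf("done\n");
	return 0;
}

Because equal cookie values mean the same shared object, "do these two
tasks match" stays a single integer comparison on the hot path.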
From 7ac592aa35a684ff1858fb9ec282886b9e3575ac Mon Sep 17 00:00:00 2001
From: Chris Hyser
Date: Wed, 24 Mar 2021 17:40:15 -0400
Subject: sched: prctl() core-scheduling interface

This patch provides support for setting and copying core scheduling
'task cookies' between threads (PID), processes (TGID), and process
groups (PGID).

The value of core scheduling isn't that tasks don't share a core;
'nosmt' can do that. The value lies in exploiting all the sharing
opportunities that exist to recover possible lost performance, and
that requires a degree of flexibility in the API.

From a security perspective (and there are others), the thread,
process and process group distinction is an existing hierarchical
categorization of tasks that reflects many of the security concerns
about 'data sharing'. For example, protecting against cache-snooping
by a thread that can just read the memory directly isn't all that
useful.

With this in mind, subcommands to CREATE/SHARE (TO/FROM) provide a
mechanism to create and share cookies. CREATE/SHARE_TO specify a
target pid with enum pidtype used to specify the scope of the
targeted tasks. For example, PIDTYPE_TGID will share the cookie with
the process and all of its threads, as typically desired in a
security scenario.

API:

  prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET,        tgtpid, pidtype, &cookie)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE,     tgtpid, pidtype, NULL)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO,   tgtpid, pidtype, NULL)
  prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, srcpid, pidtype, NULL)

where 'tgtpid/srcpid == 0' implies the current process and pidtype is
kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}.

For return values, EINVAL and ENOMEM are what they say. ESRCH means
the tgtpid/srcpid was not found. EPERM indicates lack of PTRACE
permission access to tgtpid/srcpid. ENODEV indicates your machine
lacks SMT.

[peterz: complete rewrite]
Signed-off-by: Chris Hyser
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Don Hiatt
Tested-by: Hongyu Ning
Tested-by: Vincent Guittot
Link: https://lkml.kernel.org/r/20210422123309.039845339@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fba47e52e482..c7e7d50e2fdc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2182,6 +2182,8 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
 extern void sched_core_fork(struct task_struct *p);
+extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+				unsigned long uaddr);
 #else
 static inline void sched_core_free(struct task_struct *tsk) { }
 static inline void sched_core_fork(struct task_struct *p) { }
-- cgit
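A minimal userspace sketch of the interface above. The constants are
defined locally in case the installed <sys/prctl.h> predates them
(the values match the uapi header from this series; PR_SCHED_CORE is
62). Running it needs a kernel built with CONFIG_SCHED_CORE on an SMT
machine, otherwise prctl() fails with ENODEV as documented above.

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/prctl.h>

#ifndef PR_SCHED_CORE
#define PR_SCHED_CORE			62
#define PR_SCHED_CORE_GET		0
#define PR_SCHED_CORE_CREATE		1
#define PR_SCHED_CORE_SHARE_TO		2
#define PR_SCHED_CORE_SHARE_FROM	3
#endif

/* Matches kernel enum pid_type: PIDTYPE_PID == 0, PIDTYPE_TGID == 1. */
#define SCOPE_THREAD		0
#define SCOPE_THREAD_GROUP	1

int main(void)
{
	unsigned long cookie = 0;

	/* Give the current process and all of its threads a fresh cookie. */
	if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0,
		  SCOPE_THREAD_GROUP, 0)) {
		fprintf(stderr, "create: %s\n", strerror(errno));
		return 1;
	}

	/* Read the cookie back; pid 0 means the current task, and GET
	 * only accepts thread scope with an 8-byte-aligned pointer. */
	if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, SCOPE_THREAD,
		  (unsigned long)&cookie)) {
		fprintf(stderr, "get: %s\n", strerror(errno));
		return 1;
	}
	printf("core cookie: %#lx\n", cookie);
	return 0;
}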
From b03fbd4ff24c5f075e58eb19261d5f8b3e40d7c6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 11 Jun 2021 10:28:12 +0200
Subject: sched: Introduce task_is_running()

Replace a bunch of 'p->state == TASK_RUNNING' comparisons with a new
helper: task_is_running(p).

Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Davidlohr Bueso
Acked-by: Geert Uytterhoeven
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20210611082838.222401495@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ac5a7d29fd4f..2cd56352dae1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,6 +113,8 @@ struct task_group;
 					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
 					 TASK_PARKED)
 
+#define task_is_running(task)		(READ_ONCE((task)->state) == TASK_RUNNING)
+
 #define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
 
 #define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
-- cgit

From d6c23bb3a2ad2f8f7dd46292b8bc54d27f2fb3f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 11 Jun 2021 10:28:14 +0200
Subject: sched: Add get_current_state()

Remove yet another few p->state accesses.

Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Will Deacon
Link: https://lore.kernel.org/r/20210611082838.347475156@infradead.org
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cd56352dae1..395c8906f502 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -213,6 +213,8 @@ struct task_group;
 
 #endif
 
+#define get_current_state()	READ_ONCE(current->state)
+
 /* Task command name length: */
 #define TASK_COMM_LEN			16
-- cgit
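Both helpers funnel what used to be open-coded racy 'p->state' reads
through one READ_ONCE()-annotated accessor, which is what makes the
type change in the next patch mechanical. The idea, sketched with a
userspace READ_ONCE() stand-in (struct fake_task and its field are
illustrative, not kernel code):

#include <stdio.h>

/* Userspace stand-in for the kernel's READ_ONCE(). */
#define READ_ONCE(x)	(*(const volatile __typeof__(x) *)&(x))

#define TASK_RUNNING	0x0000

struct fake_task {
	unsigned int state;
};

/* One annotated accessor instead of many scattered racy reads. */
static inline int task_is_running(const struct fake_task *t)
{
	return READ_ONCE(t->state) == TASK_RUNNING;
}

int main(void)
{
	struct fake_task t = { .state = TASK_RUNNING };

	printf("running: %d\n", task_is_running(&t));
	return 0;
}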
From 2f064a59a11ff9bc22e52e9678bc601404c7cb34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 11 Jun 2021 10:28:17 +0200
Subject: sched: Change task_struct::state

Change the type and name of task_struct::state. Drop the volatile and
shrink it to an 'unsigned int'. Rename it in order to find all uses
such that we can use READ_ONCE/WRITE_ONCE as appropriate.

Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Daniel Bristot de Oliveira
Acked-by: Will Deacon
Acked-by: Daniel Thompson
Link: https://lore.kernel.org/r/20210611082838.550736351@infradead.org
---
 include/linux/sched.h | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 395c8906f502..50db9496c99d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,13 +113,13 @@ struct task_group;
 					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
 					 TASK_PARKED)
 
-#define task_is_running(task)		(READ_ONCE((task)->state) == TASK_RUNNING)
+#define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
 
-#define task_is_traced(task)		((task->state & __TASK_TRACED) != 0)
+#define task_is_traced(task)		((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
 
-#define task_is_stopped(task)		((task->state & __TASK_STOPPED) != 0)
+#define task_is_stopped(task)		((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
 
-#define task_is_stopped_or_traced(task)	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_stopped_or_traced(task)	((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
@@ -134,14 +134,14 @@ struct task_group;
 	do {							\
 		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
-		current->state = (state_value);			\
+		WRITE_ONCE(current->__state, (state_value));	\
 	} while (0)
 
 #define set_current_state(state_value)				\
 	do {							\
 		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
-		smp_store_mb(current->state, (state_value));	\
+		smp_store_mb(current->__state, (state_value));	\
 	} while (0)
 
 #define set_special_state(state_value)				\
 	do {							\
@@ -150,7 +150,7 @@ struct task_group;
 		WARN_ON_ONCE(!is_special_task_state(state_value));	\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
 		current->task_state_change = _THIS_IP_;			\
-		current->state = (state_value);				\
+		WRITE_ONCE(current->__state, (state_value));		\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
 	} while (0)
 #else
@@ -192,10 +192,10 @@ struct task_group;
  * Also see the comments of try_to_wake_up().
  */
 #define __set_current_state(state_value)				\
-	current->state = (state_value)
+	WRITE_ONCE(current->__state, (state_value))
 
 #define set_current_state(state_value)					\
-	smp_store_mb(current->state, (state_value))
+	smp_store_mb(current->__state, (state_value))
 
 /*
  * set_special_state() should be used for those states when the blocking task
@@ -207,13 +207,13 @@ struct task_group;
 	do {							\
 		unsigned long flags; /* may shadow */		\
 		raw_spin_lock_irqsave(&current->pi_lock, flags);\
-		current->state = (state_value);			\
+		WRITE_ONCE(current->__state, (state_value));	\
 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);\
 	} while (0)
 
 #endif
 
-#define get_current_state()	READ_ONCE(current->state)
+#define get_current_state()	READ_ONCE(current->__state)
 
 /* Task command name length: */
 #define TASK_COMM_LEN			16
@@ -666,8 +666,7 @@ struct task_struct {
 	 */
 	struct thread_info		thread_info;
 #endif
-	/* -1 unrunnable, 0 runnable, >0 stopped: */
-	volatile long			state;
+	unsigned int			__state;
 
 	/*
 	 * This begins the randomizable portion of task_struct. Only
@@ -1532,7 +1531,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 
 static inline unsigned int task_state_index(struct task_struct *tsk)
 {
-	unsigned int tsk_state = READ_ONCE(tsk->state);
+	unsigned int tsk_state = READ_ONCE(tsk->__state);
 	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
 
 	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
@@ -1840,10 +1839,10 @@ static __always_inline void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 }
-extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
 #else
 static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
 	return 1;
 }
-- cgit
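What the rename preserves: set_current_state() remains a store followed
by a full memory barrier (smp_store_mb()), pairing with the wakeup
path, while __set_current_state() is a plain annotated store for when
the caller knows no ordering is needed. A loose userspace analogue
using C11 atomics; illustration only, every my_-prefixed name is
invented and the kernel primitives differ:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int cur_state;

#define MY_TASK_RUNNING		0x0000
#define MY_TASK_INTERRUPTIBLE	0x0001

/* Analogue of set_current_state(): store plus full barrier. */
static void my_set_current_state(unsigned int state)
{
	atomic_store_explicit(&cur_state, state, memory_order_seq_cst);
}

/* Analogue of __set_current_state(): store without ordering. */
static void my_set_current_state_relaxed(unsigned int state)
{
	atomic_store_explicit(&cur_state, state, memory_order_relaxed);
}

int main(void)
{
	my_set_current_state(MY_TASK_INTERRUPTIBLE);
	/* ...a real wait loop would re-check its condition here,
	 * then call schedule()... */
	my_set_current_state_relaxed(MY_TASK_RUNNING);
	printf("state=%u\n", atomic_load(&cur_state));
	return 0;
}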