From efd984c481abb516fab8bafb25bf41fd9397a43c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Aug 2021 12:16:59 +0100 Subject: sched/fair: Add NOHZ balancer flag for nohz.next_balance updates A following patch will trigger NOHZ idle balances as a means to update nohz.next_balance. Vincent noted that blocked load updates can have non-negligible overhead, which should be avoided if the intent is to only update nohz.next_balance. Add a new NOHZ balance kick flag, NOHZ_NEXT_KICK. Gate NOHZ blocked load update by the presence of NOHZ_STATS_KICK - currently all NOHZ balance kicks will have the NOHZ_STATS_KICK flag set, so no change in behaviour is expected. Suggested-by: Vincent Guittot Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210823111700.2842997-2-valentin.schneider@arm.com --- kernel/sched/fair.c | 24 ++++++++++++++---------- kernel/sched/sched.h | 8 +++++++- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6a05d9b5443..f4de7f555cbe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10375,7 +10375,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; if (rq->nr_running >= 2) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; } @@ -10389,7 +10389,7 @@ static void nohz_balancer_kick(struct rq *rq) * on. */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10403,7 +10403,7 @@ static void nohz_balancer_kick(struct rq *rq) */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10416,7 +10416,7 @@ static void nohz_balancer_kick(struct rq *rq) * to run the misfit task on. */ if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -10443,7 +10443,7 @@ static void nohz_balancer_kick(struct rq *rq) */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10605,7 +10605,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. */ - WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.has_blocked, 0); /* * Ensures that if we miss the CPU, we must see the has_blocked @@ -10627,13 +10628,15 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, * balancing owner will pick it up. 
*/ if (need_resched()) { - has_blocked_load = true; + if (flags & NOHZ_STATS_KICK) + has_blocked_load = true; goto abort; } rq = cpu_rq(balance_cpu); - has_blocked_load |= update_nohz_stats(rq); + if (flags & NOHZ_STATS_KICK) + has_blocked_load |= update_nohz_stats(rq); /* * If time for next balance is due, @@ -10664,8 +10667,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, if (likely(update_next_balance)) nohz.next_balance = next_balance; - WRITE_ONCE(nohz.next_blocked, - now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); abort: /* There is still blocked load, enable periodic update */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3d3e5793e117..1fec3132d57e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2709,12 +2709,18 @@ extern void cfs_bandwidth_usage_dec(void); #define NOHZ_BALANCE_KICK_BIT 0 #define NOHZ_STATS_KICK_BIT 1 #define NOHZ_NEWILB_KICK_BIT 2 +#define NOHZ_NEXT_KICK_BIT 3 +/* Run rebalance_domains() */ #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) +/* Update blocked load */ #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) +/* Update blocked load when entering idle */ #define NOHZ_NEWILB_KICK BIT(NOHZ_NEWILB_KICK_BIT) +/* Update nohz.next_balance */ +#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT) -#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) +#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK) #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -- cgit From 7fd7a9e0caba10829b4f8db1aa7711b558681fd4 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 23 Aug 2021 12:17:00 +0100 Subject: sched/fair: Trigger nohz.next_balance updates when a CPU goes NOHZ-idle Consider a system with some NOHZ-idle CPUs, such that nohz.idle_cpus_mask = S nohz.next_balance = T When a new CPU k goes NOHZ idle (nohz_balance_enter_idle()), we end up with: nohz.idle_cpus_mask = S \U {k} nohz.next_balance = T Note that the nohz.next_balance hasn't changed - it won't be updated until a NOHZ balance is triggered. This is problematic if the newly NOHZ idle CPU has an earlier rq.next_balance than the other NOHZ idle CPUs, IOW if: cpu_rq(k).next_balance < nohz.next_balance In such scenarios, the existing nohz.next_balance will prevent any NOHZ balance from happening, which itself will prevent nohz.next_balance from being updated to this new cpu_rq(k).next_balance. Unnecessary load balance delays of over 12ms caused by this were observed on an arm64 RB5 board. Use the new nohz.needs_update flag to mark the presence of newly-idle CPUs that need their rq->next_balance to be collated into nohz.next_balance. Trigger a NOHZ_NEXT_KICK when the flag is set. 
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210823111700.2842997-3-valentin.schneider@arm.com --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f4de7f555cbe..6cc958e0584d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5787,6 +5787,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; int has_blocked; /* Idle CPUS has blocked load */ + int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ } nohz ____cacheline_aligned; @@ -10450,6 +10451,9 @@ static void nohz_balancer_kick(struct rq *rq) unlock: rcu_read_unlock(); out: + if (READ_ONCE(nohz.needs_update)) + flags |= NOHZ_NEXT_KICK; + if (flags) kick_ilb(flags); } @@ -10546,12 +10550,13 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our * @idle_cpus_mask store, it must observe the @has_blocked - * store. + * and @needs_update stores. */ smp_mb__after_atomic(); set_cpu_sd_state_idle(cpu); + WRITE_ONCE(nohz.needs_update, 1); out: /* * Each time a cpu enter idle, we assume that it has blocked load and @@ -10600,13 +10605,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, /* * We assume there will be no idle load after this update and clear * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trig another update of idle load. + * set the has_blocked flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. + * + * Same applies to idle_cpus_mask vs needs_update. */ if (flags & NOHZ_STATS_KICK) WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 0); /* * Ensures that if we miss the CPU, we must see the has_blocked @@ -10630,6 +10639,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, if (need_resched()) { if (flags & NOHZ_STATS_KICK) has_blocked_load = true; + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 1); goto abort; } -- cgit From c33627e9a1143afb988fb98d917c4a2faa16f9d9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 26 Aug 2021 19:04:08 +0200 Subject: sched: Switch wait_task_inactive to HRTIMER_MODE_REL_HARD With PREEMPT_RT enabled all hrtimers callbacks will be invoked in softirq mode unless they are explicitly marked as HRTIMER_MODE_HARD. During boot kthread_bind() is used for the creation of per-CPU threads and then hangs in wait_task_inactive() if the ksoftirqd is not yet up and running. The hang disappeared since commit 26c7295be0c5e ("kthread: Do not preempt current task if it is going to call schedule()") but enabling function trace on boot reliably leads to the freeze on boot behaviour again. The timer in wait_task_inactive() can not be directly used by a user interface to abuse it and create a mass wake up of several tasks at the same time leading to long sections with disabled interrupts. Therefore it is safe to make the timer HRTIMER_MODE_REL_HARD. Switch the timer to HRTIMER_MODE_REL_HARD. 
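For reference, the resulting fragment in wait_task_inactive() looks roughly like this (a sketch of the kernel code after the change, not a self-contained example; the diff below contains the exact one-line switch):

	ktime_t to = NSEC_PER_SEC / HZ;		/* sleep for about one tick */

	set_current_state(TASK_UNINTERRUPTIBLE);
	/*
	 * HRTIMER_MODE_REL_HARD: the wakeup is delivered from hard interrupt
	 * context even on PREEMPT_RT, so this sleep no longer depends on
	 * ksoftirqd already being up and runnable.
	 */
	schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);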
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210826170408.vm7rlj7odslshwch@linutronix.de --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1bba4128a3e6..267269473d84 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3251,7 +3251,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); continue; } -- cgit From bc9ffef31bf59819c9fc032178534ff9ed7c4981 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Aug 2021 11:05:47 +0200 Subject: sched/core: Simplify core-wide task selection Tao suggested a two-pass task selection to avoid the retry loop. Not only does it avoid the retry loop, it results in *much* simpler code. This also fixes an issue spotted by Josh Don where, for SMT3+, we can forget to update max on the first pass and get to do an extra round. Suggested-by: Tao Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Don Reviewed-by: Vineeth Pillai (Microsoft) Link: https://lkml.kernel.org/r/YSS9+k1teA9oPEKl@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 156 +++++++++++++++------------------------------------- 1 file changed, 45 insertions(+), 111 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 267269473d84..f963c8113375 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5580,8 +5580,7 @@ restart: return p; } - /* The idle class should always have a runnable task: */ - BUG(); + BUG(); /* The idle class should always have a runnable task. */ } #ifdef CONFIG_SCHED_CORE @@ -5603,54 +5602,18 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -// XXX fairness/fwd progress conditions -/* - * Returns - * - NULL if there is no runnable task for this class. - * - the highest priority task for this runqueue if it matches - * rq->core->core_cookie or its priority is greater than max. - * - Else returns idle_task. - */ -static struct task_struct * -pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi) +static inline struct task_struct *pick_task(struct rq *rq) { - struct task_struct *class_pick, *cookie_pick; - unsigned long cookie = rq->core->core_cookie; - - class_pick = class->pick_task(rq); - if (!class_pick) - return NULL; - - if (!cookie) { - /* - * If class_pick is tagged, return it only if it has - * higher priority than max. - */ - if (max && class_pick->core_cookie && - prio_less(class_pick, max, in_fi)) - return idle_sched_class.pick_task(rq); + const struct sched_class *class; + struct task_struct *p; - return class_pick; + for_each_class(class) { + p = class->pick_task(rq); + if (p) + return p; } - /* - * If class_pick is idle or matches cookie, return early. - */ - if (cookie_equals(class_pick, cookie)) - return class_pick; - - cookie_pick = sched_core_find(rq, cookie); - - /* - * If class > max && class > cookie, it is the highest priority task on - * the core (so far) and it must be selected, otherwise we must go with - * the cookie pick in order to satisfy the constraint. 
- */ - if (prio_less(cookie_pick, class_pick, in_fi) && - (!max || prio_less(max, class_pick, in_fi))) - return class_pick; - - return cookie_pick; + BUG(); /* The idle class should always have a runnable task. */ } extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); @@ -5658,11 +5621,12 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *max = NULL; - const struct sched_class *class; + struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; bool fi_before = false; - int i, j, cpu, occ = 0; + unsigned long cookie; + int i, cpu, occ = 0; + struct rq *rq_i; bool need_sync; if (!sched_core_enabled(rq)) @@ -5735,12 +5699,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - for_each_class(class) { - next = class->pick_task(rq); - if (next) - break; - } - + next = pick_task(rq); if (!next->core_cookie) { rq->core_pick = NULL; /* @@ -5753,76 +5712,51 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } } - for_each_cpu(i, smt_mask) { - struct rq *rq_i = cpu_rq(i); - - rq_i->core_pick = NULL; + /* + * For each thread: do the regular task pick and find the max prio task + * amongst them. + * + * Tie-break prio towards the current CPU + */ + for_each_cpu_wrap(i, smt_mask, cpu) { + rq_i = cpu_rq(i); if (i != cpu) update_rq_clock(rq_i); + + p = rq_i->core_pick = pick_task(rq_i); + if (!max || prio_less(max, p, fi_before)) + max = p; } + cookie = rq->core->core_cookie = max->core_cookie; + /* - * Try and select tasks for each sibling in descending sched_class - * order. + * For each thread: try and find a runnable task that matches @max or + * force idle. */ - for_each_class(class) { -again: - for_each_cpu_wrap(i, smt_mask, cpu) { - struct rq *rq_i = cpu_rq(i); - struct task_struct *p; - - if (rq_i->core_pick) - continue; + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick; - /* - * If this sibling doesn't yet have a suitable task to - * run; ask for the most eligible task, given the - * highest priority task already selected for this - * core. - */ - p = pick_task(rq_i, class, max, fi_before); + if (!cookie_equals(p, cookie)) { + p = NULL; + if (cookie) + p = sched_core_find(rq_i, cookie); if (!p) - continue; + p = idle_sched_class.pick_task(rq_i); + } - if (!is_task_rq_idle(p)) - occ++; + rq_i->core_pick = p; - rq_i->core_pick = p; - if (rq_i->idle == p && rq_i->nr_running) { + if (p == rq_i->idle) { + if (rq_i->nr_running) { rq->core->core_forceidle = true; if (!fi_before) rq->core->core_forceidle_seq++; } - - /* - * If this new candidate is of higher priority than the - * previous; and they're incompatible; we need to wipe - * the slate and start over. pick_task makes sure that - * p's priority is more than max if it doesn't match - * max's cookie. - * - * NOTE: this is a linear max-filter and is thus bounded - * in execution time. - */ - if (!max || !cookie_match(max, p)) { - struct task_struct *old_max = max; - - rq->core->core_cookie = p->core_cookie; - max = p; - - if (old_max) { - rq->core->core_forceidle = false; - for_each_cpu(j, smt_mask) { - if (j == i) - continue; - - cpu_rq(j)->core_pick = NULL; - } - occ = 1; - goto again; - } - } + } else { + occ++; } } @@ -5842,7 +5776,7 @@ again: * non-matching user state. 
*/ for_each_cpu(i, smt_mask) { - struct rq *rq_i = cpu_rq(i); + rq_i = cpu_rq(i); /* * An online sibling might have gone offline before a task -- cgit From a130e8fbc7de796eb6e680724d87f4737a26d0ac Mon Sep 17 00:00:00 2001 From: Josh Don Date: Fri, 27 Aug 2021 09:54:38 -0700 Subject: fs/proc/uptime.c: Fix idle time reporting in /proc/uptime /proc/uptime reports idle time by reading the CPUTIME_IDLE field from the per-cpu kcpustats. However, on NO_HZ systems, idle time is not continually updated on idle cpus, leading this value to appear incorrectly small. /proc/stat performs an accounting update when reading idle time; we can use the same approach for uptime. With this patch, /proc/stat and /proc/uptime now agree on idle time. Additionally, the following shows idle time tick up consistently on an idle machine: (while true; do cat /proc/uptime; sleep 1; done) | awk '{print $2-prev; prev=$2}' Reported-by: Luigi Rizzo Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Dumazet Link: https://lkml.kernel.org/r/20210827165438.3280779-1-joshdon@google.com --- fs/proc/stat.c | 4 ++-- fs/proc/uptime.c | 14 +++++++++----- include/linux/kernel_stat.h | 1 + 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6561a06ef905..4fb8729a68d4 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -24,7 +24,7 @@ #ifdef arch_idle_time -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle; @@ -46,7 +46,7 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) #else -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 5a1b228964fb..deb99bc9b7e6 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -12,18 +12,22 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec64 uptime; struct timespec64 idle; - u64 nsec; + u64 idle_nsec; u32 rem; int i; - nsec = 0; - for_each_possible_cpu(i) - nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + idle_nsec = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcs; + + kcpustat_cpu_fetch(&kcs, i); + idle_nsec += get_idle_time(&kcs, i); + } ktime_get_boottime_ts64(&uptime); timens_add_boottime(&uptime); - idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44ae1a7eb9e3..69ae6b278464 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -102,6 +102,7 @@ extern void account_system_index_time(struct task_struct *, u64, enum cpu_usage_stat); extern void account_steal_time(u64); extern void account_idle_time(u64); +extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) -- cgit From a480addecc0d89c200ec0b41da62ae8ceddca8d7 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 19 Aug 2021 18:04:01 -0700 Subject: sched: Account number of SCHED_IDLE entities on each cfs_rq Adds cfs_rq->idle_nr_running, which accounts the number of idle entities directly enqueued on the cfs_rq. 
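The accounting itself is small; a condensed user-space model is shown below (illustrative only: the cfs_rq is reduced to the two counters involved and the kernel's se_is_idle() test is replaced by a plain flag):

#include <stdbool.h>
#include <stdio.h>

struct cfs_rq {
	unsigned int nr_running;	/* all entities enqueued here */
	unsigned int idle_nr_running;	/* SCHED_IDLE entities only */
};

static void account_enqueue(struct cfs_rq *cfs_rq, bool se_is_idle)
{
	cfs_rq->nr_running++;
	if (se_is_idle)
		cfs_rq->idle_nr_running++;
}

static void account_dequeue(struct cfs_rq *cfs_rq, bool se_is_idle)
{
	cfs_rq->nr_running--;
	if (se_is_idle)
		cfs_rq->idle_nr_running--;
}

/* True when every directly enqueued entity is SCHED_IDLE (and there is at least one). */
static bool sched_idle_cfs_rq(const struct cfs_rq *cfs_rq)
{
	return cfs_rq->nr_running &&
	       cfs_rq->nr_running == cfs_rq->idle_nr_running;
}

int main(void)
{
	struct cfs_rq rq = { 0 };

	account_enqueue(&rq, true);
	printf("all idle: %d\n", sched_idle_cfs_rq(&rq));	/* 1 */
	account_enqueue(&rq, false);
	printf("all idle: %d\n", sched_idle_cfs_rq(&rq));	/* 0 */
	account_dequeue(&rq, false);
	printf("all idle: %d\n", sched_idle_cfs_rq(&rq));	/* 1 again */
	return 0;
}

Note that, as the helper's comment in the diff below says, this counts only directly enqueued entities; idle descendants of normal entities are not included.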
Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210820010403.946838-3-joshdon@google.com --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 25 ++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 17a653b67006..2e5fdd98dacc 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -614,6 +614,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", + cfs_rq->idle_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6cc958e0584d..9c78c16ef462 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2995,6 +2995,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running++; + if (se_is_idle(se)) + cfs_rq->idle_nr_running++; } static void @@ -3008,6 +3010,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running--; + if (se_is_idle(se)) + cfs_rq->idle_nr_running--; } /* @@ -5577,6 +5581,17 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +/* + * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use + * of idle_nr_running, which does not consider idle descendants of normal + * entities. + */ +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) +{ + return cfs_rq->nr_running && + cfs_rq->nr_running == cfs_rq->idle_nr_running; +} + #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { @@ -11575,7 +11590,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -11586,6 +11601,14 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; + if (se->on_rq) { + parent_cfs_rq = cfs_rq_of(se); + if (cfs_rq_is_idle(grp_cfs_rq)) + parent_cfs_rq->idle_nr_running++; + else + parent_cfs_rq->idle_nr_running--; + } + idle_task_delta = grp_cfs_rq->h_nr_running - grp_cfs_rq->idle_h_nr_running; if (!cfs_rq_is_idle(grp_cfs_rq)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1fec3132d57e..f2965b558683 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -530,6 +530,7 @@ struct cfs_rq { struct load_weight load; unsigned int nr_running; unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; -- cgit From 51ce83ed523b00d58f2937ec014b12daaad55185 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 19 Aug 2021 18:04:02 -0700 Subject: sched: reduce sched slice for SCHED_IDLE entities Use a small, non-scaled min granularity for SCHED_IDLE entities, when competing with normal entities. 
This reduces the latency of getting a normal entity back on cpu, at the expense of increased context switch frequency of SCHED_IDLE entities. The benefit of this change is to reduce the round-robin latency for normal entities when competing with a SCHED_IDLE entity. Example: on a machine with HZ=1000, spawned two threads, one of which is SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE thread runs for 4ms then waits for 1.4s. With this patch, it runs for 1ms and waits 340ms (as it round-robins with the other thread). Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20210820010403.946838-4-joshdon@google.com --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 29 ++++++++++++++++++++++++----- kernel/sched/sched.h | 1 + 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2e5fdd98dacc..34913a7f775a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -311,6 +311,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); + debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); @@ -812,6 +813,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); + PN(sysctl_sched_idle_min_granularity); PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9c78c16ef462..d835061a9bd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -59,6 +59,14 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +/* + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. + * Applies only when SCHED_IDLE tasks compete with normal tasks. + * + * (default: 0.75 msec) + */ +unsigned int sysctl_sched_idle_min_granularity = 750000ULL; + /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ @@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running) return sysctl_sched_latency; } +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); + /* * We calculate the wall-time slice from the period by taking a part * proportional to the weight. 
@@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running) static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned int nr_running = cfs_rq->nr_running; + struct sched_entity *init_se = se; + unsigned int min_gran; u64 slice; if (sched_feat(ALT_PERIOD)) @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *load; struct load_weight lw; + struct cfs_rq *qcfs_rq; - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; + qcfs_rq = cfs_rq_of(se); + load = &qcfs_rq->load; if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; + lw = qcfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) slice = __calc_delta(slice, se->load.weight, load); } - if (sched_feat(BASE_SLICE)) - slice = max(slice, (u64)sysctl_sched_min_granularity); + if (sched_feat(BASE_SLICE)) { + if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) + min_gran = sysctl_sched_idle_min_granularity; + else + min_gran = sysctl_sched_min_granularity; + + slice = max_t(u64, slice, min_gran); + } return slice; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f2965b558683..15a8895698b9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2403,6 +2403,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost; #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_idle_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; -- cgit From 2cae3948edd488ebdef4deaf1d1043f92f47e665 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Thu, 19 Aug 2021 18:04:03 -0700 Subject: sched: adjust sleeper credit for SCHED_IDLE entities Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken SCHED_IDLE entities will take longer to preempt normal entities. The benefit of this change is to make it less likely that a newly woken SCHED_IDLE entity will preempt a short-running normal entity before it blocks. We still give a small sleeper credit to SCHED_IDLE entities, so that idle<->idle competition retains some fairness. Example: With HZ=1000, spawned four threads affined to one cpu, one of which was set to SCHED_IDLE. Without this patch, wakeup latency for the SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was ~5ms. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Jiang Biao Link: https://lore.kernel.org/r/20210820010403.946838-5-joshdon@google.com --- kernel/sched/fair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d835061a9bd2..5457c8001c2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4230,7 +4230,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* sleeps up to a single latency don't count. 
*/ if (!initial) { - unsigned long thresh = sysctl_sched_latency; + unsigned long thresh; + + if (se_is_idle(se)) + thresh = sysctl_sched_min_granularity; + else + thresh = sysctl_sched_latency; /* * Halve their sleep time's effect, to allow -- cgit From bcb1704a1ed2de580a46f28922e223a65f16e0f5 Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Mon, 30 Aug 2021 11:22:14 +0800 Subject: sched/fair: Add cfs bandwidth burst statistics Two new statistics are introduced to show the internal of burst feature and explain why burst helps or not. nr_bursts: number of periods bandwidth burst occurs burst_time: cumulative wall-time (in nanoseconds) that any cpus has used above quota in respective periods Co-developed-by: Shanpei Chen Signed-off-by: Shanpei Chen Co-developed-by: Tianchen Ding Signed-off-by: Tianchen Ding Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210830032215.16302-2-changhuaixin@linux.alibaba.com --- kernel/sched/core.c | 13 ++++++++++--- kernel/sched/fair.c | 9 +++++++++ kernel/sched/sched.h | 3 +++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f963c8113375..ccb604a1bd22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10406,6 +10406,9 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "wait_sum %llu\n", ws); } + seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); + seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time); + return 0; } #endif /* CONFIG_CFS_BANDWIDTH */ @@ -10521,16 +10524,20 @@ static int cpu_extra_stat_show(struct seq_file *sf, { struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - u64 throttled_usec; + u64 throttled_usec, burst_usec; throttled_usec = cfs_b->throttled_time; do_div(throttled_usec, NSEC_PER_USEC); + burst_usec = cfs_b->burst_time; + do_div(burst_usec, NSEC_PER_USEC); seq_printf(sf, "nr_periods %d\n" "nr_throttled %d\n" - "throttled_usec %llu\n", + "throttled_usec %llu\n" + "nr_bursts %d\n" + "burst_usec %llu\n", cfs_b->nr_periods, cfs_b->nr_throttled, - throttled_usec); + throttled_usec, cfs_b->nr_burst, burst_usec); } #endif return 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5457c8001c2a..fd41abef1950 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4715,11 +4715,20 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { + s64 runtime; + if (unlikely(cfs_b->quota == RUNTIME_INF)) return; cfs_b->runtime += cfs_b->quota; + runtime = cfs_b->runtime_snap - cfs_b->runtime; + if (runtime > 0) { + cfs_b->burst_time += runtime; + cfs_b->nr_burst++; + } + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); + cfs_b->runtime_snap = cfs_b->runtime; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 15a8895698b9..8712fc431417 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -369,6 +369,7 @@ struct cfs_bandwidth { u64 quota; u64 runtime; u64 burst; + u64 runtime_snap; s64 hierarchical_quota; u8 idle; @@ -381,7 +382,9 @@ struct cfs_bandwidth { /* Statistics: */ int nr_periods; int nr_throttled; + int nr_burst; u64 throttled_time; + u64 burst_time; #endif }; -- cgit From d73df887b6b8174dfbb7f5f878fbd1e0e2eb3f08 Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Mon, 30 Aug 2021 11:22:15 +0800 Subject: 
sched/fair: Add document for burstable CFS bandwidth Basic description of usage and effect for CFS Bandwidth Control Burst. Co-developed-by: Shanpei Chen Signed-off-by: Shanpei Chen Co-developed-by: Tianchen Ding Signed-off-by: Tianchen Ding Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210830032215.16302-3-changhuaixin@linux.alibaba.com --- Documentation/admin-guide/cgroup-v2.rst | 8 ++++ Documentation/scheduler/sched-bwc.rst | 84 +++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index babbe04c8d37..d5b0e8aa043a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1016,6 +1016,8 @@ All time durations are in microseconds. - nr_periods - nr_throttled - throttled_usec + - nr_bursts + - burst_usec cpu.weight A read-write single value file which exists on non-root @@ -1047,6 +1049,12 @@ All time durations are in microseconds. $PERIOD duration. "max" for $MAX indicates no limit. If only one number is written, $MAX is updated. + cpu.max.burst + A read-write single value file which exists on non-root + cgroups. The default is "0". + + The burst in the range [0, $MAX]. + cpu.pressure A read-write nested-keyed file. diff --git a/Documentation/scheduler/sched-bwc.rst b/Documentation/scheduler/sched-bwc.rst index 1fc73555f5c4..173c14110c85 100644 --- a/Documentation/scheduler/sched-bwc.rst +++ b/Documentation/scheduler/sched-bwc.rst @@ -22,9 +22,52 @@ cfs_quota units at each period boundary. As threads consume this bandwidth it is transferred to cpu-local "silos" on a demand basis. The amount transferred within each of these updates is tunable and described as the "slice". +Burst feature +------------- +This feature borrows time now against our future underrun, at the cost of +increased interference against the other system users. All nicely bounded. + +Traditional (UP-EDF) bandwidth control is something like: + + (U = \Sum u_i) <= 1 + +This guaranteeds both that every deadline is met and that the system is +stable. After all, if U were > 1, then for every second of walltime, +we'd have to run more than a second of program time, and obviously miss +our deadline, but the next deadline will be further out still, there is +never time to catch up, unbounded fail. + +The burst feature observes that a workload doesn't always executes the full +quota; this enables one to describe u_i as a statistical distribution. + +For example, have u_i = {x,e}_i, where x is the p(95) and x+e p(100) +(the traditional WCET). This effectively allows u to be smaller, +increasing the efficiency (we can pack more tasks in the system), but at +the cost of missing deadlines when all the odds line up. However, it +does maintain stability, since every overrun must be paired with an +underrun as long as our x is above the average. + +That is, suppose we have 2 tasks, both specify a p(95) value, then we +have a p(95)*p(95) = 90.25% chance both tasks are within their quota and +everything is good. At the same time we have a p(5)p(5) = 0.25% chance +both tasks will exceed their quota at the same time (guaranteed deadline +fail). Somewhere in between there's a threshold where one exceeds and +the other doesn't underrun enough to compensate; this depends on the +specific CDFs. 
+ +At the same time, we can say that the worst case deadline miss, will be +\Sum e_i; that is, there is a bounded tardiness (under the assumption +that x+e is indeed WCET). + +The interferenece when using burst is valued by the possibilities for +missing the deadline and the average WCET. Test results showed that when +there many cgroups or CPU is under utilized, the interference is +limited. More details are shown in: +https://lore.kernel.org/lkml/5371BD36-55AE-4F71-B9D7-B86DC32E3D2B@linux.alibaba.com/ + Management ---------- -Quota and period are managed within the cpu subsystem via cgroupfs. +Quota, period and burst are managed within the cpu subsystem via cgroupfs. .. note:: The cgroupfs files described in this section are only applicable @@ -32,29 +75,37 @@ Quota and period are managed within the cpu subsystem via cgroupfs. :ref:`Documentation/admin-guide/cgroup-v2.rst `. - cpu.cfs_quota_us: the total available run-time within a period (in - microseconds) +- cpu.cfs_quota_us: run-time replenished within a period (in microseconds) - cpu.cfs_period_us: the length of a period (in microseconds) - cpu.stat: exports throttling statistics [explained further below] +- cpu.cfs_burst_us: the maximum accumulated run-time (in microseconds) The default values are:: cpu.cfs_period_us=100ms - cpu.cfs_quota=-1 + cpu.cfs_quota_us=-1 + cpu.cfs_burst_us=0 A value of -1 for cpu.cfs_quota_us indicates that the group does not have any bandwidth restriction in place, such a group is described as an unconstrained bandwidth group. This represents the traditional work-conserving behavior for CFS. -Writing any (valid) positive value(s) will enact the specified bandwidth limit. -The minimum quota allowed for the quota or period is 1ms. There is also an -upper bound on the period length of 1s. Additional restrictions exist when -bandwidth limits are used in a hierarchical fashion, these are explained in -more detail below. +Writing any (valid) positive value(s) no smaller than cpu.cfs_burst_us will +enact the specified bandwidth limit. The minimum quota allowed for the quota or +period is 1ms. There is also an upper bound on the period length of 1s. +Additional restrictions exist when bandwidth limits are used in a hierarchical +fashion, these are explained in more detail below. Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit and return the group to an unconstrained state once more. +A value of 0 for cpu.cfs_burst_us indicates that the group can not accumulate +any unused bandwidth. It makes the traditional bandwidth control behavior for +CFS unchanged. Writing any (valid) positive value(s) no larger than +cpu.cfs_quota_us into cpu.cfs_burst_us will enact the cap on unused bandwidth +accumulation. + Any updates to a group's bandwidth specification will result in it becoming unthrottled if it is in a constrained state. @@ -74,7 +125,7 @@ for more fine-grained consumption. Statistics ---------- -A group's bandwidth statistics are exported via 3 fields in cpu.stat. +A group's bandwidth statistics are exported via 5 fields in cpu.stat. cpu.stat: @@ -82,6 +133,9 @@ cpu.stat: - nr_throttled: Number of times the group has been throttled/limited. - throttled_time: The total time duration (in nanoseconds) for which entities of the group have been throttled. +- nr_bursts: Number of periods burst occurs. +- burst_time: Cumulative wall-time (in nanoseconds) that any CPUs has used + above quota in respective periods This interface is read-only. 
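As an aside to the burst description above, the p(95)*p(95) / p(5)*p(5) arithmetic generalizes in the obvious way to more tasks; a trivial stand-alone computation (illustrative only, not part of the patch; build with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	const double p_within = 0.95;	/* each task stays within quota with p(95) */
	int n;

	for (n = 2; n <= 4; n++)
		printf("%d tasks: all within quota %.2f%%, all exceed %.4f%%\n",
		       n, 100.0 * pow(p_within, n),
		       100.0 * pow(1.0 - p_within, n));
	return 0;
}

For n = 2 this reproduces the 90.25% and 0.25% figures quoted in the text above.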
@@ -179,3 +233,15 @@ Examples By using a small period here we are ensuring a consistent latency response at the expense of burst capacity. + +4. Limit a group to 40% of 1 CPU, and allow accumulate up to 20% of 1 CPU + additionally, in case accumulation has been done. + + With 50ms period, 20ms quota will be equivalent to 40% of 1 CPU. + And 10ms burst will be equivalent to 20% of 1 CPU. + + # echo 20000 > cpu.cfs_quota_us /* quota = 20ms */ + # echo 50000 > cpu.cfs_period_us /* period = 50ms */ + # echo 10000 > cpu.cfs_burst_us /* burst = 10ms */ + + Larger buffer setting (no larger than quota) allows greater burst capacity. -- cgit From 1c36432b278cecf1499f21fae19836e614954309 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 2 Sep 2021 10:43:33 +0800 Subject: kselftests/sched: cleanup the child processes Previously, 'make -C sched run_tests' will block forever when it occurs something wrong where the *selftests framework* is waiting for its child processes to exit. [root@iaas-rpma sched]# ./cs_prctl_test ## Create a thread/process/process group hiearchy Not a core sched system tid=74985, / tgid=74985 / pgid=74985: ffffffffffffffff Not a core sched system tid=74986, / tgid=74986 / pgid=74985: ffffffffffffffff Not a core sched system tid=74988, / tgid=74986 / pgid=74985: ffffffffffffffff Not a core sched system tid=74989, / tgid=74986 / pgid=74985: ffffffffffffffff Not a core sched system tid=74990, / tgid=74986 / pgid=74985: ffffffffffffffff Not a core sched system tid=74987, / tgid=74987 / pgid=74985: ffffffffffffffff Not a core sched system tid=74991, / tgid=74987 / pgid=74985: ffffffffffffffff Not a core sched system tid=74992, / tgid=74987 / pgid=74985: ffffffffffffffff Not a core sched system tid=74993, / tgid=74987 / pgid=74985: ffffffffffffffff Not a core sched system (268) FAILED: get_cs_cookie(0) == 0 ## Set a cookie on entire process group -1 = prctl(62, 1, 0, 2, 0) core_sched create failed -- PGID: Invalid argument (cs_prctl_test.c:272) - [root@iaas-rpma sched]# ps PID TTY TIME CMD 4605 pts/2 00:00:00 bash 74986 pts/2 00:00:00 cs_prctl_test 74987 pts/2 00:00:00 cs_prctl_test 74999 pts/2 00:00:00 ps Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Hyser Link: https://lore.kernel.org/r/20210902024333.75983-1-lizhijian@cn.fujitsu.com --- tools/testing/selftests/sched/cs_prctl_test.c | 28 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/sched/cs_prctl_test.c b/tools/testing/selftests/sched/cs_prctl_test.c index 7db9cf822dc7..8109b17dc764 100644 --- a/tools/testing/selftests/sched/cs_prctl_test.c +++ b/tools/testing/selftests/sched/cs_prctl_test.c @@ -62,6 +62,17 @@ enum pid_type {PIDTYPE_PID = 0, PIDTYPE_TGID, PIDTYPE_PGID}; const int THREAD_CLONE_FLAGS = CLONE_THREAD | CLONE_SIGHAND | CLONE_FS | CLONE_VM | CLONE_FILES; +struct child_args { + int num_threads; + int pfd[2]; + int cpid; + int thr_tids[MAX_THREADS]; +}; + +static struct child_args procs[MAX_PROCESSES]; +static int num_processes = 2; +static int need_cleanup = 0; + static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { @@ -78,8 +89,14 @@ static int _prctl(int option, unsigned long arg2, unsigned long arg3, unsigned l #define handle_error(msg) __handle_error(__FILE__, __LINE__, msg) static void __handle_error(char *fn, int ln, char *msg) { + int pidx; printf("(%s:%d) - ", fn, ln); perror(msg); + if (need_cleanup) { + for (pidx = 0; 
pidx < num_processes; ++pidx) + kill(procs[pidx].cpid, 15); + need_cleanup = 0; + } exit(EXIT_FAILURE); } @@ -106,13 +123,6 @@ static unsigned long get_cs_cookie(int pid) return cookie; } -struct child_args { - int num_threads; - int pfd[2]; - int cpid; - int thr_tids[MAX_THREADS]; -}; - static int child_func_thread(void __attribute__((unused))*arg) { while (1) @@ -212,10 +222,7 @@ void _validate(int line, int val, char *msg) int main(int argc, char *argv[]) { - struct child_args procs[MAX_PROCESSES]; - int keypress = 0; - int num_processes = 2; int num_threads = 3; int delay = 0; int res = 0; @@ -262,6 +269,7 @@ int main(int argc, char *argv[]) printf("\n## Create a thread/process/process group hiearchy\n"); create_processes(num_processes, num_threads, procs); + need_cleanup = 1; disp_processes(num_processes, procs); validate(get_cs_cookie(0) == 0); -- cgit From a2dcb276ff9287fcea103ca1a2436383e8583751 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:40 +0000 Subject: sched/fair: Use __schedstat_set() in set_next_entity() schedstat_enabled() has been already checked, so we can use __schedstat_set() directly. Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20210905143547.4668-2-laoar.shao@gmail.com --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fd41abef1950..61d3e3b97fe5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4530,9 +4530,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, - max((u64)schedstat_val(se->statistics.slice_max), - se->sum_exec_runtime - se->prev_sum_exec_runtime)); + __schedstat_set(se->statistics.slice_max, + max((u64)se->statistics.slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } se->prev_sum_exec_runtime = se->sum_exec_runtime; -- cgit From ceeadb83aea28372e54857bf88ab7e17af48ab7b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:41 +0000 Subject: sched: Make struct sched_statistics independent of fair sched class If we want to use the schedstats facility to trace other sched classes, we should make it independent of fair sched class. The struct sched_statistics is the schedular statistics of a task_struct or a task_group. So we can move it into struct task_struct and struct task_group to achieve the goal. After the patch, schestats are orgnized as follows, struct task_struct { ... struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; ... struct sched_statistics stats; ... }; Regarding the task group, schedstats is only supported for fair group sched, and a new struct sched_entity_stats is introduced, suggested by Peter - struct sched_entity_stats { struct sched_entity se; struct sched_statistics stats; } __no_randomize_layout; Then with the se in a task_group, we can easily get the stats. The sched_statistics members may be frequently modified when schedstats is enabled, in order to avoid impacting on random data which may in the same cacheline with them, the struct sched_statistics is defined as cacheline aligned. As this patch changes the core struct of scheduler, so I verified the performance it may impact on the scheduler with 'perf bench sched pipe', suggested by Mel. Below is the result, in which all the values are in usecs/op. 
Before After kernel.sched_schedstats=0 5.2~5.4 5.2~5.4 kernel.sched_schedstats=1 5.3~5.5 5.3~5.5 [These data is a little difference with the earlier version, that is because my old test machine is destroyed so I have to use a new different test machine.] Almost no impact on the sched performance. No functional change. [lkp@intel.com: reported build failure in earlier version] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20210905143547.4668-3-laoar.shao@gmail.com --- include/linux/sched.h | 6 ++-- kernel/sched/core.c | 25 +++++++------ kernel/sched/deadline.c | 4 +-- kernel/sched/debug.c | 92 +++++++++++++++++++++++++----------------------- kernel/sched/fair.c | 89 +++++++++++++++++++++++++++------------------- kernel/sched/rt.c | 4 +-- kernel/sched/stats.h | 19 ++++++++++ kernel/sched/stop_task.c | 4 +-- 8 files changed, 143 insertions(+), 100 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c1a927ddec64..2bc4c72fec2d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -522,7 +522,7 @@ struct sched_statistics { u64 nr_wakeups_passive; u64 nr_wakeups_idle; #endif -}; +} ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ @@ -538,8 +538,6 @@ struct sched_entity { u64 nr_migrations; - struct sched_statistics statistics; - #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; @@ -803,6 +801,8 @@ struct task_struct { struct uclamp_se uclamp[UCLAMP_CNT]; #endif + struct sched_statistics stats; + #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ccb604a1bd22..a1741e004eaa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3489,11 +3489,11 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); - __schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(p->stats.nr_wakeups_local); } else { struct sched_domain *sd; - __schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->stats.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -3505,14 +3505,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - __schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->stats.nr_wakeups_migrate); #endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); - __schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(p->stats.nr_wakeups); if (wake_flags & WF_SYNC) - __schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->stats.nr_wakeups_sync); } /* @@ -4196,7 +4196,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif RB_CLEAR_NODE(&p->dl.rb_node); @@ -9553,9 +9553,9 @@ void normalize_rt_tasks(void) continue; p->se.exec_start = 0; - schedstat_set(p->se.statistics.wait_start, 0); - schedstat_set(p->se.statistics.sleep_start, 0); - schedstat_set(p->se.statistics.block_start, 0); + schedstat_set(p->stats.wait_start, 0); + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); if (!dl_task(p) && !rt_task(p)) { /* @@ -10397,11 +10397,14 @@ 
static int cpu_cfs_stat_show(struct seq_file *sf, void *v) seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); if (schedstat_enabled() && tg != &root_task_group) { + struct sched_statistics *stats; u64 ws = 0; int i; - for_each_possible_cpu(i) - ws += schedstat_val(tg->se[i]->statistics.wait_sum); + for_each_possible_cpu(i) { + stats = __schedstats_from_se(tg->se[i]); + ws += schedstat_val(stats->wait_sum); + } seq_printf(sf, "wait_sum %llu\n", ws); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e94314633b39..51dd30990042 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1265,8 +1265,8 @@ static void update_curr_dl(struct rq *rq) return; } - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 34913a7f775a..76fd38ea844c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -449,9 +449,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group struct sched_entity *se = tg->se[cpu]; #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) -#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) +#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \ + #F, (long long)schedstat_val(stats->F)) #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) -#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) +#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", \ + #F, SPLIT_NS((long long)schedstat_val(stats->F))) if (!se) return; @@ -461,16 +463,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->sum_exec_runtime); if (schedstat_enabled()) { - PN_SCHEDSTAT(se->statistics.wait_start); - PN_SCHEDSTAT(se->statistics.sleep_start); - PN_SCHEDSTAT(se->statistics.block_start); - PN_SCHEDSTAT(se->statistics.sleep_max); - PN_SCHEDSTAT(se->statistics.block_max); - PN_SCHEDSTAT(se->statistics.exec_max); - PN_SCHEDSTAT(se->statistics.slice_max); - PN_SCHEDSTAT(se->statistics.wait_max); - PN_SCHEDSTAT(se->statistics.wait_sum); - P_SCHEDSTAT(se->statistics.wait_count); + struct sched_statistics *stats = __schedstats_from_se(se); + + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); } P(se->load.weight); @@ -537,9 +541,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) p->prio); SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)), + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -958,8 +962,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "---------------------------------------------------------" "----------\n"); -#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->F)) -#define PN_SCHEDSTAT(F) 
__PSN(#F, schedstat_val(p->F)) +#define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F)) +#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F)) PN(se.exec_start); PN(se.vruntime); @@ -972,33 +976,33 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; - PN_SCHEDSTAT(se.statistics.sum_sleep_runtime); - PN_SCHEDSTAT(se.statistics.wait_start); - PN_SCHEDSTAT(se.statistics.sleep_start); - PN_SCHEDSTAT(se.statistics.block_start); - PN_SCHEDSTAT(se.statistics.sleep_max); - PN_SCHEDSTAT(se.statistics.block_max); - PN_SCHEDSTAT(se.statistics.exec_max); - PN_SCHEDSTAT(se.statistics.slice_max); - PN_SCHEDSTAT(se.statistics.wait_max); - PN_SCHEDSTAT(se.statistics.wait_sum); - P_SCHEDSTAT(se.statistics.wait_count); - PN_SCHEDSTAT(se.statistics.iowait_sum); - P_SCHEDSTAT(se.statistics.iowait_count); - P_SCHEDSTAT(se.statistics.nr_migrations_cold); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_running); - P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot); - P_SCHEDSTAT(se.statistics.nr_forced_migrations); - P_SCHEDSTAT(se.statistics.nr_wakeups); - P_SCHEDSTAT(se.statistics.nr_wakeups_sync); - P_SCHEDSTAT(se.statistics.nr_wakeups_migrate); - P_SCHEDSTAT(se.statistics.nr_wakeups_local); - P_SCHEDSTAT(se.statistics.nr_wakeups_remote); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine); - P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts); - P_SCHEDSTAT(se.statistics.nr_wakeups_passive); - P_SCHEDSTAT(se.statistics.nr_wakeups_idle); + PN_SCHEDSTAT(sum_sleep_runtime); + PN_SCHEDSTAT(wait_start); + PN_SCHEDSTAT(sleep_start); + PN_SCHEDSTAT(block_start); + PN_SCHEDSTAT(sleep_max); + PN_SCHEDSTAT(block_max); + PN_SCHEDSTAT(exec_max); + PN_SCHEDSTAT(slice_max); + PN_SCHEDSTAT(wait_max); + PN_SCHEDSTAT(wait_sum); + P_SCHEDSTAT(wait_count); + PN_SCHEDSTAT(iowait_sum); + P_SCHEDSTAT(iowait_count); + P_SCHEDSTAT(nr_migrations_cold); + P_SCHEDSTAT(nr_failed_migrations_affine); + P_SCHEDSTAT(nr_failed_migrations_running); + P_SCHEDSTAT(nr_failed_migrations_hot); + P_SCHEDSTAT(nr_forced_migrations); + P_SCHEDSTAT(nr_wakeups); + P_SCHEDSTAT(nr_wakeups_sync); + P_SCHEDSTAT(nr_wakeups_migrate); + P_SCHEDSTAT(nr_wakeups_local); + P_SCHEDSTAT(nr_wakeups_remote); + P_SCHEDSTAT(nr_wakeups_affine); + P_SCHEDSTAT(nr_wakeups_affine_attempts); + P_SCHEDSTAT(nr_wakeups_passive); + P_SCHEDSTAT(nr_wakeups_idle); avg_atom = p->se.sum_exec_runtime; if (nr_switches) @@ -1064,7 +1068,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, void proc_sched_set_task(struct task_struct *p) { #ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + memset(&p->stats, 0, sizeof(p->stats)); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61d3e3b97fe5..97c6ecb03bb2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -856,8 +856,13 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->exec_start = now; - schedstat_set(curr->statistics.exec_max, - max(delta_exec, curr->statistics.exec_max)); + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); @@ -885,39 +890,45 @@ static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { u64 wait_start, prev_wait_start; + struct 
sched_statistics *stats; if (!schedstat_enabled()) return; + stats = __schedstats_from_se(se); + wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); + prev_wait_start = schedstat_val(stats->wait_start); if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && likely(wait_start > prev_wait_start)) wait_start -= prev_wait_start; - __schedstat_set(se->statistics.wait_start, wait_start); + __schedstat_set(stats->wait_start, wait_start); } static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *p; + struct sched_statistics *stats; + struct task_struct *p = NULL; u64 delta; if (!schedstat_enabled()) return; + stats = __schedstats_from_se(se); + /* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario. */ - if (unlikely(!schedstat_val(se->statistics.wait_start))) + if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); if (entity_is_task(se)) { p = task_of(se); @@ -927,30 +938,33 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - __schedstat_set(se->statistics.wait_start, delta); + __schedstat_set(stats->wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); + __schedstat_set(stats->wait_max, + max(schedstat_val(stats->wait_max), delta)); + __schedstat_inc(stats->wait_count); + __schedstat_add(stats->wait_sum, delta); + __schedstat_set(stats->wait_start, 0); } static inline void update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { + struct sched_statistics *stats; struct task_struct *tsk = NULL; u64 sleep_start, block_start; if (!schedstat_enabled()) return; - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); + stats = __schedstats_from_se(se); + + sleep_start = schedstat_val(stats->sleep_start); + block_start = schedstat_val(stats->block_start); if (entity_is_task(se)) tsk = task_of(se); @@ -961,11 +975,11 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if ((s64)delta < 0) delta = 0; - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); + if (unlikely(delta > schedstat_val(stats->sleep_max))) + __schedstat_set(stats->sleep_max, delta); - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(stats->sleep_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); @@ -978,16 +992,16 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if ((s64)delta < 0) delta = 0; - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); + if (unlikely(delta > schedstat_val(stats->block_max))) + __schedstat_set(stats->block_max, delta); - 
__schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(stats->block_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); + __schedstat_add(stats->iowait_sum, delta); + __schedstat_inc(stats->iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -1049,10 +1063,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* XXX racy against TTWU */ state = READ_ONCE(tsk->__state); if (state & TASK_INTERRUPTIBLE) - __schedstat_set(se->statistics.sleep_start, + __schedstat_set(tsk->stats.sleep_start, rq_clock(rq_of(cfs_rq))); if (state & TASK_UNINTERRUPTIBLE) - __schedstat_set(se->statistics.block_start, + __schedstat_set(tsk->stats.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -4530,8 +4544,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - __schedstat_set(se->statistics.slice_max, - max((u64)se->statistics.slice_max, + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime)); } @@ -6046,12 +6063,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->stats.nr_wakeups_affine_attempts); if (target == nr_cpumask_bits) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); + schedstat_inc(p->stats.nr_wakeups_affine); return target; } @@ -7855,7 +7872,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->stats.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -7889,7 +7906,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); + schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -7911,12 +7928,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); + schedstat_inc(p->stats.nr_forced_migrations); } return 1; } - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->stats.nr_failed_migrations_hot); return 0; } @@ -11457,7 +11474,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity), + se = kzalloc_node(sizeof(struct sched_entity_stats), GFP_KERNEL, cpu_to_node(i)); if (!se) goto err_free_rq; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3daf42a0f462..95a7c3ad2dc3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1009,8 +1009,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, 
delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index d8f8eb0c655b..fb6022e860af 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -41,6 +41,7 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) #else /* !CONFIG_SCHEDSTATS: */ + static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) { } static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } @@ -53,8 +54,26 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_set(var, val) do { } while (0) # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 + #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_FAIR_GROUP_SCHED +struct sched_entity_stats { + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; +#endif + +static inline struct sched_statistics * +__schedstats_from_se(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!entity_is_task(se)) + return &container_of(se, struct sched_entity_stats, se)->stats; +#endif + return &task_of(se)->stats; +} + #ifdef CONFIG_PSI /* * PSI tracks state that persists across sleeps, such as iowaits and diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index f988ebe3febb..0b165a25f22f 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -78,8 +78,8 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit From 60f2415e19d3948641149ac6aca137a7be1d1952 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:42 +0000 Subject: sched: Make schedstats helpers independent of fair sched class The original prototype of the schedstats helpers are update_stats_wait_*(struct cfs_rq *cfs_rq, struct sched_entity *se) The cfs_rq in these helpers is used to get the rq_clock, and the se is used to get the struct sched_statistics and the struct task_struct. In order to make these helpers available by all sched classes, we can pass the rq, sched_statistics and task_struct directly. Then the new helpers are update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) which are independent of fair sched class. To avoid vmlinux growing too large or introducing ovehead when !schedstat_enabled(), some new helpers after schedstat_enabled() are also introduced, Suggested by Mel. These helpers are in sched/stats.c, __update_stats_wait_*(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) The size of vmlinux as follows, Before After Size of vmlinux 826308552 826304640 The size is a litte smaller as some functions are not inlined again after the change. I also compared the sched performance with 'perf bench sched pipe', suggested by Mel. 
The result as followsi (in usecs/op), Before After kernel.sched_schedstats=0 5.2~5.4 5.2~5.4 kernel.sched_schedstats=1 5.3~5.5 5.3~5.5 [These data is a little difference with the prev version, that is because my old test machine is destroyed so I have to use a new different test machine.] Almost no difference. No functional change. [lkp@intel.com: reported build failure in prev version] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20210905143547.4668-4-laoar.shao@gmail.com --- kernel/sched/fair.c | 134 ++++++++------------------------------------------- kernel/sched/stats.c | 103 +++++++++++++++++++++++++++++++++++++++ kernel/sched/stats.h | 30 ++++++++++++ 3 files changed, 152 insertions(+), 115 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97c6ecb03bb2..71b30ef31110 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -887,32 +887,27 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start, prev_wait_start; struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; stats = __schedstats_from_se(se); - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(stats->wait_start); - - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + if (entity_is_task(se)) + p = task_of(se); - __schedstat_set(stats->wait_start, wait_start); + __update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats; struct task_struct *p = NULL; - u64 delta; if (!schedstat_enabled()) return; @@ -928,105 +923,34 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(stats->wait_start); - - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* - * Preserve migrating task's wait time so wait_start - * time stamp can be adjusted to accumulate wait time - * prior to migration. 
- */ - __schedstat_set(stats->wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(stats->wait_max, - max(schedstat_val(stats->wait_max), delta)); - __schedstat_inc(stats->wait_count); - __schedstat_add(stats->wait_sum, delta); - __schedstat_set(stats->wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_statistics *stats; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; stats = __schedstats_from_se(se); - sleep_start = schedstat_val(stats->sleep_start); - block_start = schedstat_val(stats->block_start); - if (entity_is_task(se)) tsk = task_of(se); - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(stats->sleep_max))) - __schedstat_set(stats->sleep_max, delta); - - __schedstat_set(stats->sleep_start, 0); - __schedstat_add(stats->sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(stats->block_max))) - __schedstat_set(stats->block_max, delta); - - __schedstat_set(stats->block_start, 0); - __schedstat_add(stats->sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(stats->iowait_sum, delta); - __schedstat_inc(stats->iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } + __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats); } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) return; @@ -1036,14 +960,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); + update_stats_wait_start_fair(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) - update_stats_enqueue_sleeper(cfs_rq, se); + update_stats_enqueue_sleeper_fair(cfs_rq, se); } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) @@ -1054,7 +978,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * waiting task: */ if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -4267,26 +4191,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -static inline void 
check_schedstat_required(void) -{ -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) - return; - - /* Force schedstat enabled if a dependent tracepoint is active */ - if (trace_sched_stat_wait_enabled() || - trace_sched_stat_sleep_enabled() || - trace_sched_stat_iowait_enabled() || - trace_sched_stat_blocked_enabled() || - trace_sched_stat_runtime_enabled()) { - printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " - "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enable or " - "kernel.sched_schedstats=1\n"); - } -#endif -} - static inline bool cfs_bandwidth_used(void); /* @@ -4360,7 +4264,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) place_entity(cfs_rq, se, 0); check_schedstat_required(); - update_stats_enqueue(cfs_rq, se, flags); + update_stats_enqueue_fair(cfs_rq, se, flags); check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); @@ -4444,7 +4348,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_load_avg(cfs_rq, se, UPDATE_TG); se_update_runnable(se); - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue_fair(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -4529,7 +4433,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); } @@ -4631,7 +4535,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) check_spread(cfs_rq, prev); if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); + update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 3f93fc3b5648..fad781ca7791 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -4,6 +4,109 @@ */ #include "sched.h" +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 wait_start, prev_wait_start; + + wait_start = rq_clock(rq); + prev_wait_start = schedstat_val(stats->wait_start); + + if (p && likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; + + __schedstat_set(stats->wait_start, wait_start); +} + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 delta = rq_clock(rq) - schedstat_val(stats->wait_start); + + if (p) { + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
+ */ + __schedstat_set(stats->wait_start, delta); + + return; + } + + trace_sched_stat_wait(p, delta); + } + + __schedstat_set(stats->wait_max, + max(schedstat_val(stats->wait_max), delta)); + __schedstat_inc(stats->wait_count); + __schedstat_add(stats->wait_sum, delta); + __schedstat_set(stats->wait_start, 0); +} + +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats) +{ + u64 sleep_start, block_start; + + sleep_start = schedstat_val(stats->sleep_start); + block_start = schedstat_val(stats->block_start); + + if (sleep_start) { + u64 delta = rq_clock(rq) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(stats->sleep_max))) + __schedstat_set(stats->sleep_max, delta); + + __schedstat_set(stats->sleep_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); + + if (p) { + account_scheduler_latency(p, delta >> 10, 1); + trace_sched_stat_sleep(p, delta); + } + } + + if (block_start) { + u64 delta = rq_clock(rq) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(stats->block_max))) + __schedstat_set(stats->block_max, delta); + + __schedstat_set(stats->block_start, 0); + __schedstat_add(stats->sum_sleep_runtime, delta); + + if (p) { + if (p->in_iowait) { + __schedstat_add(stats->iowait_sum, delta); + __schedstat_inc(stats->iowait_count); + trace_sched_stat_iowait(p, delta); + } + + trace_sched_stat_blocked(p, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(p), + delta >> 20); + } + account_scheduler_latency(p, delta >> 10, 0); + } + } +} + /* * Current schedstat API version. * diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index fb6022e860af..cfb0893a83d4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -2,6 +2,8 @@ #ifdef CONFIG_SCHEDSTATS +extern struct static_key_false sched_schedstats; + /* * Expects runqueue lock to be held for atomicity of update */ @@ -40,6 +42,29 @@ rq_sched_info_dequeue(struct rq *rq, unsigned long long delta) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? 
(var) : 0) +void __update_stats_wait_start(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +void __update_stats_wait_end(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); +void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + struct sched_statistics *stats); + +static inline void +check_schedstat_required(void) +{ + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, stat_blocked and stat_runtime require the kernel parameter schedstats=enable or kernel.sched_schedstats=1\n"); +} + #else /* !CONFIG_SCHEDSTATS: */ static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } @@ -55,6 +80,11 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val(var) 0 # define schedstat_val_or_zero(var) 0 +# define __update_stats_wait_start(rq, p, stats) do { } while (0) +# define __update_stats_wait_end(rq, p, stats) do { } while (0) +# define __update_stats_enqueue_sleeper(rq, p, stats) do { } while (0) +# define check_schedstat_required() do { } while (0) + #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit From 847fc0cd0664fcb2a08ac66df6b85935361ec454 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:43 +0000 Subject: sched: Introduce task block time in schedstats Currently in schedstats we have sum_sleep_runtime and iowait_sum, but there's no metric to show how long the task is in D state. Once a task in D state, it means the task is blocked in the kernel, for example the task may be waiting for a mutex. The D state is more frequent than iowait, and it is more critital than S state. So it is worth to add a metric to measure it. 
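As a rough sketch of the accounting (the actual change is the one-liner in the diff below; everything except sum_block_runtime already exists in __update_stats_enqueue_sleeper()):

    if (block_start) {
            u64 delta = rq_clock(rq) - block_start;

            if ((s64)delta < 0)
                    delta = 0;

            __schedstat_add(stats->sum_sleep_runtime, delta);
            /* new: blocked (D state) time is also accumulated separately */
            __schedstat_add(stats->sum_block_runtime, delta);
            __schedstat_set(stats->block_start, 0);
    }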
Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-5-laoar.shao@gmail.com --- include/linux/sched.h | 2 ++ kernel/sched/debug.c | 6 ++++-- kernel/sched/stats.c | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2bc4c72fec2d..193e16e2d0e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -503,6 +503,8 @@ struct sched_statistics { u64 block_start; u64 block_max; + s64 sum_block_runtime; + u64 exec_max; u64 slice_max; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 76fd38ea844c..26fac5e28bc0 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -540,10 +540,11 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime))); + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); @@ -977,6 +978,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, u64 avg_atom, avg_per_cpu; PN_SCHEDSTAT(sum_sleep_runtime); + PN_SCHEDSTAT(sum_block_runtime); PN_SCHEDSTAT(wait_start); PN_SCHEDSTAT(sleep_start); PN_SCHEDSTAT(block_start); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index fad781ca7791..07dde2928c79 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -82,6 +82,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, __schedstat_set(stats->block_start, 0); __schedstat_add(stats->sum_sleep_runtime, delta); + __schedstat_add(stats->sum_block_runtime, delta); if (p) { if (p->in_iowait) { -- cgit From ed7b564cfdd0668efbd739d0b4e2d67797293f32 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:44 +0000 Subject: sched/rt: Support sched_stat_runtime tracepoint for RT sched class The runtime of a RT task has already been there, so we only need to add a tracepoint. One difference between fair task and RT task is that there is no vruntime in RT task. To reuse the sched_stat_runtime tracepoint, '0' is passed as vruntime for RT task. The output of this tracepoint for RT task as follows, stress-9748 [039] d.h. 113.519352: sched_stat_runtime: comm=stress pid=9748 runtime=997573 [ns] vruntime=0 [ns] stress-9748 [039] d.h. 113.520352: sched_stat_runtime: comm=stress pid=9748 runtime=997627 [ns] vruntime=0 [ns] stress-9748 [039] d.h. 
113.521352: sched_stat_runtime: comm=stress pid=9748 runtime=998203 [ns] vruntime=0 [ns] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-6-laoar.shao@gmail.com --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 95a7c3ad2dc3..5d251112e51c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1012,6 +1012,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); + trace_sched_stat_runtime(curr, delta_exec, 0); + curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit From 57a5c2dafca8e3ce4f70e975a9c7727b66b5071f Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:45 +0000 Subject: sched/rt: Support schedstats for RT sched class We want to measure the latency of RT tasks in our production environment with schedstats facility, but currently schedstats is only supported for fair sched class. This patch enable it for RT sched class as well. After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for RT sched class. The schedstat usage in RT sched class is similar with fair sched class, for example, fair RT enqueue update_stats_enqueue_fair update_stats_enqueue_rt dequeue update_stats_dequeue_fair update_stats_dequeue_rt put_prev_task update_stats_wait_start update_stats_wait_start_rt set_next_task update_stats_wait_end update_stats_wait_end_rt The user can get the schedstats information in the same way in fair sched class. For example, fair RT /proc/[pid]/sched /proc/[pid]/sched schedstats is not supported for RT group. The output of a RT task's schedstats as follows, $ cat /proc/10349/sched ... sum_sleep_runtime : 972.434535 sum_block_runtime : 960.433522 wait_start : 188510.871584 sleep_start : 0.000000 block_start : 0.000000 sleep_max : 12.001013 block_max : 952.660622 exec_max : 0.049629 slice_max : 0.000000 wait_max : 0.018538 wait_sum : 0.424340 wait_count : 49 iowait_sum : 956.495640 iowait_count : 24 nr_migrations_cold : 0 nr_failed_migrations_affine : 0 nr_failed_migrations_running : 0 nr_failed_migrations_hot : 0 nr_forced_migrations : 0 nr_wakeups : 49 nr_wakeups_sync : 0 nr_wakeups_migrate : 0 nr_wakeups_local : 49 nr_wakeups_remote : 0 nr_wakeups_affine : 0 nr_wakeups_affine_attempts : 0 nr_wakeups_passive : 0 nr_wakeups_idle : 0 ... The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace RT tasks as well. The output of these tracepoints for a RT tasks as follows, - runtime stress-10352 [004] d.h. 1035.382286: sched_stat_runtime: comm=stress pid=10352 runtime=995769 [ns] vruntime=0 [ns] [vruntime=0 means it is a RT task] - wait -0 [004] dN.. 1227.688544: sched_stat_wait: comm=stress pid=10352 delay=46849882 [ns] - blocked kworker/4:1-465 [004] dN.. 1585.676371: sched_stat_blocked: comm=stress pid=17194 delay=189963 [ns] - iowait kworker/4:1-465 [004] dN.. 1585.675330: sched_stat_iowait: comm=stress pid=17189 delay=182848 [ns] - sleep sleep-18194 [023] dN.. 1780.891840: sched_stat_sleep: comm=sleep.sh pid=17767 delay=1001160770 [ns] sleep-18196 [023] dN.. 1781.893208: sched_stat_sleep: comm=sleep.sh pid=17767 delay=1001161970 [ns] sleep-18197 [023] dN.. 1782.894544: sched_stat_sleep: comm=sleep.sh pid=17767 delay=1001128840 [ns] [ In sleep.sh, it sleeps 1 sec each time. 
] [lkp@intel.com: reported build failure in earlier version] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-7-laoar.shao@gmail.com --- kernel/sched/rt.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5d251112e51c..bb945f8faeca 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1273,6 +1273,112 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr rt_se->on_list = 0; } +static inline struct sched_statistics * +__schedstats_from_rt_se(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + /* schedstats is not supported for rt group. */ + if (!rt_entity_is_task(rt_se)) + return NULL; +#endif + + return &rt_task_of(rt_se)->stats; +} + +static inline void +update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_rt(rt_rq, rt_se); +} + +static inline void +update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) +{ + struct sched_statistics *stats; + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + stats = __schedstats_from_rt_se(rt_se); + if (!stats) + return; + + __update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats); +} + +static inline void +update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int flags) +{ + struct task_struct *p = NULL; + + if (!schedstat_enabled()) + return; + + if (rt_entity_is_task(rt_se)) + p = rt_task_of(rt_se); + + if ((flags & DEQUEUE_SLEEP) && p) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_rt_rq(rt_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_rt_rq(rt_rq))); + } +} + static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); @@ -1346,6 +1452,8 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); + update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, flags); @@ -1356,6 +1464,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); + update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags); + dequeue_rt_stack(rt_se, flags); 
for_each_sched_rt_entity(rt_se) { @@ -1378,6 +1488,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; + check_schedstat_required(); + update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1578,7 +1691,12 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + p->se.exec_start = rq_clock_task(rq); + if (on_rt_rq(&p->rt)) + update_stats_wait_end_rt(rt_rq, rt_se); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); @@ -1652,6 +1770,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; + + if (on_rt_rq(&p->rt)) + update_stats_wait_start_rt(rt_rq, rt_se); + update_curr_rt(rq); update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); -- cgit From 95fd58e8dadb7aa707628a2187c626bb897c49ec Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:46 +0000 Subject: sched/dl: Support sched_stat_runtime tracepoint for deadline sched class The runtime of a DL task has already been there, so we only need to add a tracepoint. One difference between fair task and DL task is that there is no vruntime in dl task. To reuse the sched_stat_runtime tracepoint, '0' is passed as vruntime for DL task. The output of this tracepoint for DL task as follows, top-36462 [047] d.h. 6083.452103: sched_stat_runtime: comm=top pid=36462 runtime=409898 [ns] vruntime=0 [ns] Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-8-laoar.shao@gmail.com --- kernel/sched/deadline.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 51dd30990042..73fb33e1868f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1268,6 +1268,8 @@ static void update_curr_dl(struct rq *rq) schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); + trace_sched_stat_runtime(curr, delta_exec, 0); + curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit From b5eb4a5f6521d58d5564b8746701bd67a92a2b11 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Sep 2021 14:35:47 +0000 Subject: sched/dl: Support schedstats for deadline sched class After we make the struct sched_statistics and the helpers of it independent of fair sched class, we can easily use the schedstats facility for deadline sched class. The schedstat usage in DL sched class is similar with fair sched class, for example, fair deadline enqueue update_stats_enqueue_fair update_stats_enqueue_dl dequeue update_stats_dequeue_fair update_stats_dequeue_dl put_prev_task update_stats_wait_start update_stats_wait_start_dl set_next_task update_stats_wait_end update_stats_wait_end_dl The user can get the schedstats information in the same way in fair sched class. For example, fair deadline /proc/[pid]/sched /proc/[pid]/sched The output of a deadline task's schedstats as follows, $ cat /proc/69662/sched ... 
se.sum_exec_runtime : 3067.696449 se.nr_migrations : 0 sum_sleep_runtime : 720144.029661 sum_block_runtime : 0.547853 wait_start : 0.000000 sleep_start : 14131540.828955 block_start : 0.000000 sleep_max : 2999.974045 block_max : 0.283637 exec_max : 1.000269 slice_max : 0.000000 wait_max : 0.002217 wait_sum : 0.762179 wait_count : 733 iowait_sum : 0.547853 iowait_count : 3 nr_migrations_cold : 0 nr_failed_migrations_affine : 0 nr_failed_migrations_running : 0 nr_failed_migrations_hot : 0 nr_forced_migrations : 0 nr_wakeups : 246 nr_wakeups_sync : 2 nr_wakeups_migrate : 0 nr_wakeups_local : 244 nr_wakeups_remote : 2 nr_wakeups_affine : 0 nr_wakeups_affine_attempts : 0 nr_wakeups_passive : 0 nr_wakeups_idle : 0 ... The sched:sched_stat_{wait, sleep, iowait, blocked} tracepoints can be used to trace deadlline tasks as well. Signed-off-by: Yafang Shao Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210905143547.4668-9-laoar.shao@gmail.com --- kernel/sched/deadline.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 73fb33e1868f..d2c072b0ef01 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1474,6 +1474,82 @@ static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); } +static inline struct sched_statistics * +__schedstats_from_dl_se(struct sched_dl_entity *dl_se) +{ + return &dl_task_of(dl_se)->stats; +} + +static inline void +update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_dl(dl_rq, dl_se); +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (!schedstat_enabled()) + return; + + if ((flags & DEQUEUE_SLEEP)) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_dl_rq(dl_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_dl_rq(dl_rq))); + } +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); @@ -1504,6 +1580,8 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); + update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); + /* * If this is a wakeup or a new instance, 
the scheduling * parameters of the task might need updating. Otherwise, @@ -1600,6 +1678,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } + check_schedstat_required(); + update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1608,6 +1689,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { + update_stats_dequeue_dl(&rq->dl, &p->dl, flags); dequeue_dl_entity(&p->dl); dequeue_pushable_dl_task(rq, p); } @@ -1827,7 +1909,12 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + p->se.exec_start = rq_clock_task(rq); + if (on_dl_rq(&p->dl)) + update_stats_wait_end_dl(dl_rq, dl_se); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); @@ -1884,6 +1971,12 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (on_dl_rq(&p->dl)) + update_stats_wait_start_dl(dl_rq, dl_se); + update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); -- cgit From 32ed980c3020b7a19e26dc488c10817807ba2a41 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 14 Sep 2021 17:52:44 +0800 Subject: sched: Remove unused inline function __rq_clock_broken() These is no caller in tree since commit 523e979d3164 ("sched/core: Use PELT for scale_rt_capacity()") Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210914095244.52780-1-yuehaibing@huawei.com --- kernel/sched/sched.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8712fc431417..198058bb54d7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1426,11 +1426,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) extern void update_rq_clock(struct rq *rq); -static inline u64 __rq_clock_broken(struct rq *rq) -{ - return READ_ONCE(rq->clock); -} - /* * rq::clock_update_flags bits * -- cgit From c597bfddc9e9e8a63817252b67c3ca0e544ace26 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 14 Sep 2021 12:31:34 +0200 Subject: sched: Provide Kconfig support for default dynamic preempt mode Currently the boot defined preempt behaviour (aka dynamic preempt) selects full preemption by default when the "preempt=" boot parameter is omitted. However distros may rather want to default to either no preemption or voluntary preemption. To provide with this flexibility, make dynamic preemption a visible Kconfig option and adapt the preemption behaviour selected by the user to either static or dynamic preemption. 
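Condensed, the intended boot-time resolution looks roughly as follows (the real implementation is preempt_dynamic_init() in the diff below; selecting the "full" behaviour needs no update because it is already the static-call default):

    /* Sketch only: run late in sched_init() when "preempt=" was not
     * passed on the command line. */
    if (preempt_dynamic_mode == preempt_dynamic_undefined) {
            if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR))
                    sched_dynamic_update(preempt_dynamic_none);
            else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR))
                    sched_dynamic_update(preempt_dynamic_voluntary);
            else
                    preempt_dynamic_mode = preempt_dynamic_full;
    }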
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210914103134.11309-1-frederic@kernel.org --- kernel/Kconfig.preempt | 32 +++++++++++++++++++++++--------- kernel/sched/core.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 5876e30c5740..60f1bfc3c7b2 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -2,10 +2,11 @@ choice prompt "Preemption Model" - default PREEMPT_NONE + default PREEMPT_NONE_BEHAVIOUR -config PREEMPT_NONE +config PREEMPT_NONE_BEHAVIOUR bool "No Forced Preemption (Server)" + select PREEMPT_NONE if !PREEMPT_DYNAMIC help This is the traditional Linux preemption model, geared towards throughput. It will still provide good latencies most of the @@ -17,9 +18,10 @@ config PREEMPT_NONE raw processing power of the kernel, irrespective of scheduling latencies. -config PREEMPT_VOLUNTARY +config PREEMPT_VOLUNTARY_BEHAVIOUR bool "Voluntary Kernel Preemption (Desktop)" depends on !ARCH_NO_PREEMPT + select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC help This option reduces the latency of the kernel by adding more "explicit preemption points" to the kernel code. These new @@ -35,12 +37,10 @@ config PREEMPT_VOLUNTARY Select this if you are building a kernel for a desktop system. -config PREEMPT +config PREEMPT_BEHAVIOUR bool "Preemptible Kernel (Low-Latency Desktop)" depends on !ARCH_NO_PREEMPT - select PREEMPTION - select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK - select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC + select PREEMPT help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -58,7 +58,7 @@ config PREEMPT config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" - depends on EXPERT && ARCH_SUPPORTS_RT + depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC select PREEMPTION help This option turns the kernel into a real-time kernel by replacing @@ -75,6 +75,17 @@ config PREEMPT_RT endchoice +config PREEMPT_NONE + bool + +config PREEMPT_VOLUNTARY + bool + +config PREEMPT + bool + select PREEMPTION + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + config PREEMPT_COUNT bool @@ -83,7 +94,10 @@ config PREEMPTION select PREEMPT_COUNT config PREEMPT_DYNAMIC - bool + bool "Preemption behaviour defined on boot" + depends on HAVE_PREEMPT_DYNAMIC + select PREEMPT + default y help This option allows to define the preemption model on the kernel command line parameter and thus override the default preemption diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1741e004eaa..95f4e16f98a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6520,12 +6520,13 @@ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); */ enum { - preempt_dynamic_none = 0, + preempt_dynamic_undefined = -1, + preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_full, }; -int preempt_dynamic_mode = preempt_dynamic_full; +int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { @@ -6598,7 +6599,27 @@ static int __init setup_preempt_mode(char *str) } __setup("preempt=", setup_preempt_mode); -#endif /* CONFIG_PREEMPT_DYNAMIC */ +static void __init preempt_dynamic_init(void) +{ + if (preempt_dynamic_mode == preempt_dynamic_undefined) { + if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) { + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) { + 
sched_dynamic_update(preempt_dynamic_voluntary); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR)); + preempt_dynamic_mode = preempt_dynamic_full; + pr_info("Dynamic Preempt: full\n"); + } + } +} + +#else /* !CONFIG_PREEMPT_DYNAMIC */ + +static inline void preempt_dynamic_init(void) { } + +#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ /* * This is the entry point to schedule() from kernel preemption @@ -9398,6 +9419,8 @@ void __init sched_init(void) init_uclamp(); + preempt_dynamic_init(); + scheduler_running = 1; } -- cgit From 1a7243ca4074beed97b68d7235a6e34862fc2cd6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 10 Nov 2020 12:38:47 +0100 Subject: kthread: Move prio/affinite change into the newly created thread With enabled threaded interrupts the nouveau driver reported the following: | Chain exists of: | &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem | | Possible unsafe locking scenario: | | CPU0 CPU1 | ---- ---- | lock(&cpuset_rwsem); | lock(&device->mutex); | lock(&cpuset_rwsem); | lock(&mm->mmap_lock#2); The device->mutex is nvkm_device::mutex. Unblocking the lockchain at `cpuset_rwsem' is probably the easiest thing to do. Move the priority reset to the start of the newly created thread. Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()") Reported-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de --- kernel/kthread.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 5b37a8567168..4a4d7092a2d8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { + static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; @@ -300,6 +301,13 @@ static int kthread(void *_create) init_completion(&self->parked); current->vfork_done = &self->exited; + /* + * The new thread inherited kthreadd's priority and CPU mask. Reset + * back to default in case they have been changed. + */ + sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), } task = create->result; if (!IS_ERR(task)) { - static const struct sched_param param = { .sched_priority = 0 }; char name[TASK_COMM_LEN]; /* @@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), */ vsnprintf(name, sizeof(name), namefmt, args); set_task_comm(task, name); - /* - * root may have changed our (kthreadd's) priority or CPU mask. - * The kernel thread should not inherit these properties. 
- */ - sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, - housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; -- cgit From 183b8ec38f1ec6c1f8419375303bf1d09a2b8369 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 10 Sep 2021 18:18:14 -0700 Subject: x86/sched: Decrease further the priorities of SMT siblings When scheduling, it is better to prefer a separate physical core rather than the SMT sibling of a high priority core. The existing formula to compute priorities takes such fact in consideration. There may exist, however, combinations of priorities (i.e., maximum frequencies) in which the priority of high-numbered SMT siblings of high-priority cores collides with the priority of low-numbered SMT siblings of low-priority cores. Consider for instance an SMT2 system with CPUs [0, 1] with priority 60 and [2, 3] with priority 30(CPUs in brackets are SMT siblings. In such a case, the resulting priorities would be [120, 60], [60, 30]. Thus, to ensure that CPU2 has higher priority than CPU1, divide the raw priority by the squared SMT iterator. The resulting priorities are [120, 30]. [60, 15]. Originally-by: Len Brown Signed-off-by: Len Brown Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210911011819.12184-2-ricardo.neri-calderon@linux.intel.com --- arch/x86/kernel/itmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1afbdd1dd777..9ff480e94511 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) * of the priority chain and only used when * all other high priority cpus are out of capacity. */ - smt_prio = prio * smp_num_siblings / i; + smt_prio = prio * smp_num_siblings / (i * i); per_cpu(sched_core_priority, cpu) = smt_prio; i++; } -- cgit From 16d364ba6ef2aa59b409df70682770f3ed23f7c0 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 10 Sep 2021 18:18:15 -0700 Subject: sched/topology: Introduce sched_group::flags There exist situations in which the load balance needs to know the properties of the CPUs in a scheduling group. When using asymmetric packing, for instance, the load balancer needs to know not only the state of dst_cpu but also of its SMT siblings, if any. Use the flags of the child scheduling domains to initialize scheduling group flags. This will reflect the properties of the CPUs in the group. A subsequent changeset will make use of these new flags. No functional changes are introduced. Originally-by: Peter Zijlstra (Intel) Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210911011819.12184-3-ricardo.neri-calderon@linux.intel.com --- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 198058bb54d7..e5c4d4ddd83a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1808,6 +1808,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; /* * The CPUs this group covers. 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4e8698e62f07..c56faae461d9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -716,8 +716,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* + * sched groups hold the flags of the child sched + * domain for convenience. Clear such flags since + * the child is being destroyed. + */ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } for (tmp = sd; tmp; tmp = tmp->parent) @@ -916,10 +928,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(&sg->ref); return sg; @@ -1169,6 +1183,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); -- cgit From 6025643596895695956c27119c6b0bfa40d8035b Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 10 Sep 2021 18:18:16 -0700 Subject: sched/fair: Optimize checking for group_asym_packing sched_asmy_prefer() always returns false when called on the local group. By checking local_group, we can avoid additional checks and invoking sched_asmy_prefer() when it is not needed. No functional changes are introduced. Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210911011819.12184-4-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 71b30ef31110..e050b1d92efe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8631,7 +8631,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, } /* Check if dst CPU is idle and preferred to this group */ - if (env->sd->flags & SD_ASYM_PACKING && + if (!local_group && env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { -- cgit From c0d14b57fe0c11b65ce8a1a4a58a48f3f324ca0f Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 10 Sep 2021 18:18:17 -0700 Subject: sched/fair: Provide update_sg_lb_stats() with sched domain statistics Before deciding to pull tasks when using asymmetric packing of tasks, on some architectures (e.g., x86) it is necessary to know not only the state of dst_cpu but also of its SMT siblings. The decision to classify a candidate busiest group as group_asym_packing is done in update_sg_lb_stats(). Give this function access to the scheduling domain statistics, which contains the statistics of the local group. 
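The interface change itself is mechanical; sketched out (see the diff below for the real thing), the new prototype and the now-cheaper local-group test are:

    static inline void update_sg_lb_stats(struct lb_env *env,
                                          struct sd_lb_stats *sds,  /* new */
                                          struct sched_group *group,
                                          struct sg_lb_stats *sgs,
                                          int *sg_status);

    /* With sds available, the local group is identified by pointer
     * comparison rather than a cpumask test: */
    local_group = group == sds->local;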
Originally-by: Peter Zijlstra (Intel) Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210911011819.12184-5-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e050b1d92efe..2e8ef337c7d4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8579,6 +8579,7 @@ group_type group_classify(unsigned int imbalance_pct, * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, int *sg_status) @@ -8587,7 +8588,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + local_group = group == sds->local; for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -9150,7 +9151,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_status); if (local_group) goto next_group; -- cgit From aafc917a3c31dcc76cb0279cd7617dda11b15f2a Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 10 Sep 2021 18:18:18 -0700 Subject: sched/fair: Carve out logic to mark a group for asymmetric packing Create a separate function, sched_asym(). A subsequent changeset will introduce logic to deal with SMT in conjunction with asmymmetric packing. Such logic will need the statistics of the scheduling group provided as argument. Update them before calling sched_asym(). Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210911011819.12184-6-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2e8ef337c7d4..1c8b5fa58c23 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8571,6 +8571,13 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } +static inline bool +sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -8631,18 +8638,17 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } + sgs->group_capacity = group->sgc->capacity; + + sgs->group_weight = group->group_weight; + /* Check if dst CPU is idle and preferred to this group */ if (!local_group && env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { + env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + sched_asym(env, sds, sgs, group)) { sgs->group_asym_packing = 1; } - sgs->group_capacity = group->sgc->capacity; - - sgs->group_weight = group->group_weight; - sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ -- cgit From 4006a72bdd93b1ffedc2bd8646dee18c822a2c26 Mon Sep 17 00:00:00 2001 From: Ricardo Neri Date: Fri, 10 Sep 2021 18:18:19 -0700 Subject: sched/fair: Consider SMT in ASYM_PACKING load balance When deciding to pull tasks in ASYM_PACKING, it is necessary not only to check for the idle state of the destination CPU, dst_cpu, but also of its SMT siblings. If dst_cpu is idle but its SMT siblings are busy, performance suffers if it pulls tasks from a medium priority CPU that does not have SMT siblings. Implement asym_smt_can_pull_tasks() to inspect the state of the SMT siblings of both dst_cpu and the CPUs in the candidate busiest group. Signed-off-by: Ricardo Neri Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Len Brown Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210911011819.12184-7-ricardo.neri-calderon@linux.intel.com --- kernel/sched/fair.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c8b5fa58c23..c50ae23013dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8571,10 +8571,96 @@ group_type group_classify(unsigned int imbalance_pct, return group_has_spare; } +/** + * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks + * @dst_cpu: Destination CPU of the load balancing + * @sds: Load-balancing data with statistics of the local group + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * + * Check the state of the SMT siblings of both @sds::local and @sg and decide + * if @dst_cpu can pull tasks. + * + * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * + * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more + * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. + * Bigger imbalances in the number of busy CPUs will be dealt with in + * update_sd_pick_busiest(). + * + * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings + * of @dst_cpu are idle and @sg has lower priority. + */ +static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sg_lb_stats *sgs, + struct sched_group *sg) +{ +#ifdef CONFIG_SCHED_SMT + bool local_is_smt, sg_is_smt; + int sg_busy_cpus; + + local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; + sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; + + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + + if (!local_is_smt) { + /* + * If we are here, @dst_cpu is idle and does not have SMT + * siblings. 
Pull tasks if candidate group has two or more + * busy CPUs. + */ + if (sg_busy_cpus >= 2) /* implies sg_is_smt */ + return true; + + /* + * @dst_cpu does not have SMT siblings. @sg may have SMT + * siblings and only one is busy. In such case, @dst_cpu + * can help if it has higher priority and is idle (i.e., + * it has no running tasks). + */ + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + } + + /* @dst_cpu has SMT siblings. */ + + if (sg_is_smt) { + int local_busy_cpus = sds->local->group_weight - + sds->local_stat.idle_cpus; + int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; + + if (busy_cpus_delta == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; + } + + /* + * @sg does not have SMT siblings. Ensure that @sds::local does not end + * up with more than one busy SMT sibling and only pull tasks if there + * are not busy CPUs (i.e., no CPU has running tasks). + */ + if (!sds->local_stat.sum_nr_running) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +#else + /* Always return false so that callers deal with non-SMT cases. */ + return false; +#endif +} + static inline bool sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, struct sched_group *group) { + /* Only do SMT checks if either local or candidate have SMT siblings */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } @@ -9580,6 +9666,12 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; + /* Make sure we only pull tasks from a CPU of lower priority */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && + nr_running == 1) + continue; + switch (env->migration_type) { case migrate_load: /* -- cgit From d07b2eee4501c393cbf5bfcad36143310cfd72f9 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 22 Sep 2021 16:57:35 +0800 Subject: sched: Make cookie functions static Make cookie functions static as these are no longer invoked directly by other code. No functional change intended. 
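With the helpers made static, the cookie machinery is reached only through the core-scheduling prctl() interface. As a rough, hedged illustration of how that surface exercises these functions from userspace (the fallback constant values below are assumptions for older headers; prefer the definitions shipped in <linux/prctl.h>):

    #include <sys/prctl.h>
    #include <unistd.h>

    #ifndef PR_SCHED_CORE
    #define PR_SCHED_CORE           62
    #define PR_SCHED_CORE_CREATE     1
    #define PR_SCHED_CORE_SHARE_TO   2
    #endif

    /* Allocate a core-sched cookie for the caller, then share it with @child. */
    static int toy_core_sched_pair(pid_t child)
    {
        if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0))
            return -1;      /* cookie allocation for the current task failed */
        return prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_TO, child, 0, 0);
    }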
Signed-off-by: Shaokun Zhang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210922085735.52812-1-zhangshaokun@hisilicon.com --- kernel/sched/core_sched.c | 9 +++++---- kernel/sched/sched.h | 5 ----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 9a80e9a474c0..48ac72696012 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -11,7 +11,7 @@ struct sched_core_cookie { refcount_t refcnt; }; -unsigned long sched_core_alloc_cookie(void) +static unsigned long sched_core_alloc_cookie(void) { struct sched_core_cookie *ck = kmalloc(sizeof(*ck), GFP_KERNEL); if (!ck) @@ -23,7 +23,7 @@ unsigned long sched_core_alloc_cookie(void) return (unsigned long)ck; } -void sched_core_put_cookie(unsigned long cookie) +static void sched_core_put_cookie(unsigned long cookie) { struct sched_core_cookie *ptr = (void *)cookie; @@ -33,7 +33,7 @@ void sched_core_put_cookie(unsigned long cookie) } } -unsigned long sched_core_get_cookie(unsigned long cookie) +static unsigned long sched_core_get_cookie(unsigned long cookie) { struct sched_core_cookie *ptr = (void *)cookie; @@ -53,7 +53,8 @@ unsigned long sched_core_get_cookie(unsigned long cookie) * * Returns: the old cookie */ -unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie) +static unsigned long sched_core_update_cookie(struct task_struct *p, + unsigned long cookie) { unsigned long old_cookie; struct rq_flags rf; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e5c4d4ddd83a..a00fc7057d97 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1258,11 +1258,6 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); extern void sched_core_get(void); extern void sched_core_put(void); -extern unsigned long sched_core_alloc_cookie(void); -extern void sched_core_put_cookie(unsigned long cookie); -extern unsigned long sched_core_get_cookie(unsigned long cookie); -extern unsigned long sched_core_update_cookie(struct task_struct *p, unsigned long cookie); - #else /* !CONFIG_SCHED_CORE */ static inline bool sched_core_enabled(struct rq *rq) -- cgit From 8d491de6edc27138806cae6e8eca455beb325b62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 14:24:32 +0200 Subject: sched: Move mmdrop to RCU on RT mmdrop() is invoked from finish_task_switch() by the incoming task to drop the mm which was handed over by the previous task. mmdrop() can be quite expensive which prevents an incoming real-time task from getting useful work done. Provide mmdrop_sched() which maps to mmdrop() on !RT kernels. On RT kernels it delegates the eventually required invocation of __mmdrop() to RCU.
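The patch below applies the usual call_rcu() deferral idiom to mm_struct. For readers unfamiliar with that idiom, a generic sketch follows; struct foo and its helpers are illustrative only, the mm-specific variant is in the diff itself:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
        /* ... payload ... */
        struct rcu_head rcu;            /* embedded callback head */
    };

    /* Runs from RCU callback context, after a grace period has elapsed. */
    static void foo_free_rcu(struct rcu_head *rhp)
    {
        kfree(container_of(rhp, struct foo, rcu));
    }

    /* Instead of freeing inline on the hot path, defer the work to RCU. */
    static void foo_put(struct foo *p)
    {
        call_rcu(&p->rcu, foo_free_rcu);
    }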
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928122411.648582026@linutronix.de --- include/linux/mm_types.h | 4 ++++ include/linux/sched/mm.h | 29 +++++++++++++++++++++++++++++ kernel/sched/core.c | 2 +- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f8ee09c711f..e9672de22cf2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -572,6 +573,9 @@ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef..aca874d33fe6 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,35 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is + * by far the least expensive way to do that. + */ +static inline void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} + +/* + * Invoked from finish_task_switch(). Delegates the heavy lifting on RT + * kernels via RCU. + */ +static inline void mmdrop_sched(struct mm_struct *mm) +{ + /* Provides a full memory barrier. See mmdrop() */ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +static inline void mmdrop_sched(struct mm_struct *mm) +{ + mmdrop(mm); +} +#endif + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95f4e16f98a2..9eaeba671001 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_sched(mm); } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) -- cgit From 691925f3ddccea832cf2d162dc277d2623a816e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 14:24:25 +0200 Subject: sched: Limit the number of task migrations per batch on RT Batched task migrations are a source for large latencies as they keep the scheduler from running while processing the migrations. Limit the batch size to 8 instead of 32 when running on a RT enabled kernel. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928122411.425097596@linutronix.de --- kernel/sched/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9eaeba671001..749284fc3c1f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,7 +74,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifdef CONFIG_PREEMPT_RT +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#else const_debug unsigned int sysctl_sched_nr_migrate = 32; +#endif /* * period over which we measure -rt task CPU usage in us. 
-- cgit From 539fbb5be0da56ffa1434b4f56521a0522bd1d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 14:24:27 +0200 Subject: sched: Disable TTWU_QUEUE on RT The queued remote wakeup mechanism has turned out to be suboptimal for RT enabled kernels. The maximum latencies go up by a factor of > 5x in certain scenarios. This is caused by either long wake lists or by a large number of TTWU IPIs which are processed back to back. Disable it for RT. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928122411.482262764@linutronix.de --- kernel/sched/features.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7f8dace0964c..1cf435bbcd9c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,11 +46,16 @@ SCHED_FEAT(DOUBLE_TICK, false) */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +#else + /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#endif /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. -- cgit From 670721c7bd2a6e16e40db29b2707a27bdecd6928 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 14:24:28 +0200 Subject: sched: Move kprobes cleanup out of finish_task_switch() Doing cleanups in the tail of schedule() is a latency punishment for the incoming task. The point of invoking kprobe_flush_task() for a dead task is that the instances are returned and cannot leak when __schedule() is kprobed. Move it into the delayed cleanup. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928122411.537994026@linutronix.de --- kernel/exit.c | 2 ++ kernel/kprobes.c | 8 ++++---- kernel/sched/core.c | 6 ------ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 91a43e57a32e..63851320ae73 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -168,6 +169,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + kprobe_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 790a573bbe00..9a38e7581a5c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1250,10 +1250,10 @@ void kprobe_busy_end(void) } /* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe instances associated - * with this task. These left over instances represent probed functions - * that have been called but will never return. + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any function-return probe instances + * associated with this task. These left over instances represent probed + * functions that have been called but will never return.
*/ void kprobe_flush_task(struct task_struct *tk) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 749284fc3c1f..e33b03c2bb9a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4846,12 +4846,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - /* Task is done with its stack. */ put_task_stack(prev); -- cgit From b945efcdd07d86cece1cce68503aae91f107eacb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Sep 2021 11:37:32 +0200 Subject: sched: Remove pointless preemption disable in sched_submit_work() Neither wq_worker_sleeping() nor io_wq_worker_sleeping() needs to be invoked with preemption disabled: - The worker flag check operations only need to be serialized against the worker thread itself. - The accounting and worker pool operations are serialized with locks. which means that disabling preemption has neither a reason nor a value. Remove it and update the stale comment. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lai Jiangshan Reviewed-by: Jens Axboe Link: https://lkml.kernel.org/r/8735pnafj7.ffs@tglx --- kernel/sched/core.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e33b03c2bb9a..e47d7e55c46a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6251,20 +6251,14 @@ static inline void sched_submit_work(struct task_struct *tsk) task_flags = tsk->flags; /* - * If a worker went to sleep, notify and ask workqueue whether - * it wants to wake up a task to maintain concurrency. - * As this function is called inside the schedule() context, - * we disable preemption to avoid it calling schedule() again - * in the possible wakeup of a kworker and because wq_worker_sleeping() - * requires it. + * If a worker goes to sleep, notify and ask workqueue whether it + * wants to wake up a task to maintain concurrency. */ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - preempt_disable(); if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); - preempt_enable_no_resched(); } if (tsk_is_pi_blocked(tsk)) -- cgit From a7ba894821b6ade7bb420455f87020b2838d6180 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 28 Sep 2021 12:35:44 +0200 Subject: sched/fair: Removed useless update of p->recent_used_cpu Since commit 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu"), p->recent_used_cpu is unconditionally set with prev.
Fixes: 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210928103544.27489-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c50ae23013dc..2468d1d1edef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6413,11 +6413,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && asym_fits_capacity(task_util, recent_used_cpu)) { - /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: - */ - p->recent_used_cpu = prev; return recent_used_cpu; } -- cgit From 769fdf83df57b373660343ef4270b3ada91ef434 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Oct 2021 10:12:05 +0200 Subject: sched: Fix DEBUG && !SCHEDSTATS warn When !SCHEDSTATS schedstat_enabled() is an unconditional 0 and the whole block doesn't exist, however GCC figures the scoped variable 'stats' is unused and complains about it. Upgrade the warning from -Wunused-variable to -Wunused-but-set-variable by writing it in two statements. This fixes the build because the new warning is in W=1. Given that whole if(0) {} thing, I don't feel motivated to change things overly much and quite strongly feel this is the compiler being daft. Fixes: cb3e971c435d ("sched: Make struct sched_statistics independent of fair sched class") Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 26fac5e28bc0..7dcbaa31c5d9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -463,7 +463,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->sum_exec_runtime); if (schedstat_enabled()) { - struct sched_statistics *stats = __schedstats_from_se(se); + struct sched_statistics *stats; + stats = __schedstats_from_se(se); PN_SCHEDSTAT(wait_start); PN_SCHEDSTAT(sleep_start); -- cgit From f6ac18fafcf6cc5e41c26766d12ad335ed81012e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Sep 2021 10:14:15 +0200 Subject: sched: Improve try_invoke_on_locked_down_task() Clarify and tighten try_invoke_on_locked_down_task(). Basically the function calls @func under task_rq_lock(), except it avoids taking rq->lock when possible. This makes calling @func unconditional (the function will get renamed in a later patch to remove the try). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.589323576@infradead.org --- kernel/sched/core.c | 63 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e47d7e55c46a..8a9aeac3469f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4115,41 +4115,56 @@ out: * @func: Function to invoke. * @arg: Argument to function. * - * If the specified task can be quickly locked into a definite state - * (either sleeping or on a given runqueue), arrange to keep it in that - * state while invoking @func(@arg). 
This function can use ->on_rq and - * task_curr() to work out what the state is, if required. Given that - * @func can be invoked with a runqueue lock held, it had better be quite - * lightweight. + * Fix the task in it's current state by avoiding wakeups and or rq operations + * and call @func(@arg) on it. This function can use ->on_rq and task_curr() + * to work out what the state is, if required. Given that @func can be invoked + * with a runqueue lock held, it had better be quite lightweight. * * Returns: - * @false if the task slipped out from under the locks. - * @true if the task was locked onto a runqueue or is sleeping. - * However, @func can override this by returning @false. + * Whatever @func returns */ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) { + struct rq *rq = NULL; + unsigned int state; struct rq_flags rf; bool ret = false; - struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (p->on_rq) { + + state = READ_ONCE(p->__state); + + /* + * Ensure we load p->on_rq after p->__state, otherwise it would be + * possible to, falsely, observe p->on_rq == 0. + * + * See try_to_wake_up() for a longer comment. + */ + smp_rmb(); + + /* + * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when + * the task is blocked. Make sure to check @state since ttwu() can drop + * locks at the end, see ttwu_queue_wakelist(). + */ + if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq) rq = __task_rq_lock(p, &rf); - if (task_rq(p) == rq) - ret = func(p, arg); + + /* + * At this point the task is pinned; either: + * - blocked and we're holding off wakeups (pi->lock) + * - woken, and we're holding off enqueue (rq->lock) + * - queued, and we're holding off schedule (rq->lock) + * - running, and we're holding off de-schedule (rq->lock) + * + * The called function (@func) can use: task_curr(), p->on_rq and + * p->__state to differentiate between these states. + */ + ret = func(p, arg); + + if (rq) rq_unlock(rq, &rf); - } else { - switch (READ_ONCE(p->__state)) { - case TASK_RUNNING: - case TASK_WAKING: - break; - default: - smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). - if (!p->on_rq) - ret = func(p, arg); - } - } + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; } -- cgit From 9b3c4ab3045e953670c7de9c1165fae5358a7237 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 21:54:32 +0200 Subject: sched,rcu: Rework try_invoke_on_locked_down_task() Give try_invoke_on_locked_down_task() a saner name and have it return an int so that the caller might distinguish between different reasons of failure. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.649944917@infradead.org --- include/linux/wait.h | 3 ++- kernel/rcu/tasks.h | 12 ++++++------ kernel/rcu/tree_stall.h | 8 ++++---- kernel/sched/core.c | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 93dab0e9580f..2d0df57c9902 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1160,6 +1160,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i (wait)->flags = 0; \ } while (0) -bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg); +typedef int (*task_call_f)(struct task_struct *p, void *arg); +extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 806160c44b17..171bc848e8e3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -928,7 +928,7 @@ reset_ipi: } /* Callback function for scheduler to check locked-down task. */ -static bool trc_inspect_reader(struct task_struct *t, void *arg) +static int trc_inspect_reader(struct task_struct *t, void *arg) { int cpu = task_cpu(t); bool in_qs = false; @@ -939,7 +939,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // If no chance of heavyweight readers, do it the hard way. if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - return false; + return -EINVAL; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. @@ -947,7 +947,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_attempts++; if (!ofl && // Check for "running" idle tasks on offline CPUs. !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) - return false; // No quiescent state, do it the hard way. + return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; if (ofl) n_heavy_reader_ofl_updates++; @@ -962,7 +962,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) t->trc_reader_checked = true; if (in_qs) - return true; // Already in quiescent state, done!!! + return 0; // Already in quiescent state, done!!! // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit @@ -970,7 +970,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) atomic_inc(&trc_n_readers_need_end); // One more to wait on. WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); - return true; + return 0; } /* Attempt to extract the state for the specified task. */ @@ -992,7 +992,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // Attempt to nail down the task for inspection. get_task_struct(t); - if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) { + if (!task_call_func(t, trc_inspect_reader, NULL)) { put_task_struct(t); return; } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 677ee3d8671b..5e2fa6fd97f1 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -240,16 +240,16 @@ struct rcu_stall_chk_rdr { * Report out the state of a not-running task that is stalling the * current RCU grace period. 
*/ -static bool check_slow_task(struct task_struct *t, void *arg) +static int check_slow_task(struct task_struct *t, void *arg) { struct rcu_stall_chk_rdr *rscrp = arg; if (task_curr(t)) - return false; // It is running, so decline to inspect it. + return -EBUSY; // It is running, so decline to inspect it. rscrp->nesting = t->rcu_read_lock_nesting; rscrp->rs = t->rcu_read_unlock_special; rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); - return true; + return 0; } /* @@ -283,7 +283,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); while (i) { t = ts[--i]; - if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) + if (task_call_func(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else pr_cont(" P%d/%d:%c%c%c%c", diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a9aeac3469f..74db3c3afa6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4110,7 +4110,7 @@ out: } /** - * try_invoke_on_locked_down_task - Invoke a function on task in fixed state + * task_call_func - Invoke a function on task in fixed state * @p: Process for which the function is to be invoked, can be @current. * @func: Function to invoke. * @arg: Argument to function. @@ -4123,12 +4123,12 @@ out: * Returns: * Whatever @func returns */ -bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) +int task_call_func(struct task_struct *p, task_call_f func, void *arg) { struct rq *rq = NULL; unsigned int state; struct rq_flags rf; - bool ret = false; + int ret; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); -- cgit From 00619f7c650e4e46c650cb2e2fd5f438b32dc64b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 21:54:32 +0200 Subject: sched,livepatch: Use task_call_func() Instead of frobbing around with scheduler internals, use the shiny new task_call_func() interface. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Vasily Gorbik Tested-by: Petr Mladek Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.709906138@infradead.org --- kernel/livepatch/transition.c | 90 +++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 291b857a6e20..75251e9dbc3c 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -13,7 +13,6 @@ #include "core.h" #include "patch.h" #include "transition.h" -#include "../sched/sched.h" #define MAX_STACK_ENTRIES 100 #define STACK_ERR_BUF_SIZE 128 @@ -240,7 +239,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, * Determine whether it's safe to transition the task to the target patch state * by looking for any to-be-patched or to-be-unpatched functions on its stack. 
*/ -static int klp_check_stack(struct task_struct *task, char *err_buf) +static int klp_check_stack(struct task_struct *task, const char **oldname) { static unsigned long entries[MAX_STACK_ENTRIES]; struct klp_object *obj; @@ -248,12 +247,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) int ret, nr_entries; ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); - if (ret < 0) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d has an unreliable stack\n", - __func__, task->comm, task->pid); - return ret; - } + if (ret < 0) + return -EINVAL; nr_entries = ret; klp_for_each_object(klp_transition_patch, obj) { @@ -262,11 +257,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) klp_for_each_func(obj, func) { ret = klp_check_stack_func(func, entries, nr_entries); if (ret) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is sleeping on function %s\n", - __func__, task->comm, task->pid, - func->old_name); - return ret; + *oldname = func->old_name; + return -EADDRINUSE; } } } @@ -274,6 +266,22 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) return 0; } +static int klp_check_and_switch_task(struct task_struct *task, void *arg) +{ + int ret; + + if (task_curr(task) && task != current) + return -EBUSY; + + ret = klp_check_stack(task, arg); + if (ret) + return ret; + + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + return 0; +} + /* * Try to safely switch a task to the target patch state. If it's currently * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or @@ -281,13 +289,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) */ static bool klp_try_switch_task(struct task_struct *task) { - static char err_buf[STACK_ERR_BUF_SIZE]; - struct rq *rq; - struct rq_flags flags; + const char *old_name; int ret; - bool success = false; - - err_buf[0] = '\0'; /* check if this task has already switched over */ if (task->patch_state == klp_target_state) @@ -305,36 +308,31 @@ static bool klp_try_switch_task(struct task_struct *task) * functions. If all goes well, switch the task to the target patch * state. */ - rq = task_rq_lock(task, &flags); + ret = task_call_func(task, klp_check_and_switch_task, &old_name); + switch (ret) { + case 0: /* success */ + break; - if (task_running(rq, task) && task != current) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); - goto done; + case -EBUSY: /* klp_check_and_switch_task() */ + pr_debug("%s: %s:%d is running\n", + __func__, task->comm, task->pid); + break; + case -EINVAL: /* klp_check_and_switch_task() */ + pr_debug("%s: %s:%d has an unreliable stack\n", + __func__, task->comm, task->pid); + break; + case -EADDRINUSE: /* klp_check_and_switch_task() */ + pr_debug("%s: %s:%d is sleeping on function %s\n", + __func__, task->comm, task->pid, old_name); + break; + + default: + pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n", + __func__, ret, task->comm, task->pid); + break; } - ret = klp_check_stack(task, err_buf); - if (ret) - goto done; - - success = true; - - clear_tsk_thread_flag(task, TIF_PATCH_PENDING); - task->patch_state = klp_target_state; - -done: - task_rq_unlock(rq, task, &flags); - - /* - * Due to console deadlock issues, pr_debug() can't be used while - * holding the task rq lock. Instead we have to use a temporary buffer - * and print the debug message after releasing the lock. 
- */ - if (err_buf[0] != '\0') - pr_debug("%s", err_buf); - - return success; + return !ret; } /* -- cgit From 8850cb663b5cda04d33f9cfbc38889d73d3c8e24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 22:16:02 +0200 Subject: sched: Simplify wake_up_*idle*() Simplify and make wake_up_if_idle() more robust, also don't iterate the whole machine with preempt_disable() in it's caller: wake_up_all_idle_cpus(). This prepares for another wake_up_if_idle() user that needs a full do_idle() cycle. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.769328779@infradead.org --- kernel/sched/core.c | 14 +++++--------- kernel/smp.c | 6 +++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 74db3c3afa6d..3b55ef99c03f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3695,15 +3695,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); diff --git a/kernel/smp.c b/kernel/smp.c index f43ede0ab183..ad0b68a3a3d3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1170,14 +1170,14 @@ void wake_up_all_idle_cpus(void) { int cpu; - preempt_disable(); + cpus_read_lock(); for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) continue; wake_up_if_idle(cpu); } - preempt_enable(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); -- cgit From 5de62ea84abd732ded7c5569426fd71c0420f83e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 22:16:02 +0200 Subject: sched,livepatch: Use wake_up_if_idle() Make sure to prod idle CPUs so they call klp_update_patch_state(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Vasily Gorbik Tested-by: Petr Mladek Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929151723.162004989@infradead.org --- include/linux/sched/idle.h | 4 ++++ kernel/livepatch/transition.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 22873d276be6..d73d314d59c6 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,7 +11,11 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; +#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); +#else +static inline void wake_up_if_idle(int cpu) { } +#endif /* * Idle thread specific functions to determine the need_resched diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 75251e9dbc3c..5683ac0d2566 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -413,8 +413,11 @@ void klp_try_complete_transition(void) for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { - if (!klp_try_switch_task(task)) + if (!klp_try_switch_task(task)) { complete = false; + /* Make idle task go through the main loop. 
*/ + wake_up_if_idle(cpu); + } } else if (task->patch_state != klp_target_state) { /* offline idle tasks can be switched immediately */ clear_tsk_thread_flag(task, TIF_PATCH_PENDING); -- cgit From 7a2341fc1fec0b8b3580be4226ea244756d3a1b3 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 4 Oct 2021 16:27:03 +0530 Subject: sched/numa: Replace hard-coded number by a define in numa_task_group() While allocating group fault stats, task_numa_group() is using a hard coded number 4. Replace this by NR_NUMA_HINT_FAULT_STATS. No functionality change in this commit. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20211004105706.3669-2-bharata@amd.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2468d1d1edef..fc0a0ed63d15 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2438,7 +2438,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + - 4*nr_node_ids*sizeof(unsigned long); + NR_NUMA_HINT_FAULT_STATS * + nr_node_ids * sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) -- cgit From 5b763a14a5164e4c442e99d186fb39dac489e49b Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 4 Oct 2021 16:27:04 +0530 Subject: sched/numa: Remove the redundant member numa_group::fault_cpus numa_group::fault_cpus is actually a pointer to the region in numa_group::faults[] where NUMA_CPU stats are located. Remove this redundant member and use numa_group::faults[NUMA_CPU] directly like it is done for similar per-process numa fault stats. There is no functionality change due to this commit. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20211004105706.3669-3-bharata@amd.com --- kernel/sched/fair.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc0a0ed63d15..cfbd5ef0b558 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1038,11 +1038,12 @@ struct numa_group { unsigned long total_faults; unsigned long max_faults_cpu; /* + * faults[] array is split into two regions: faults_mem and faults_cpu. + * * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted * more by CPU use than by memory faults. */ - unsigned long *faults_cpu; unsigned long faults[]; }; @@ -1216,8 +1217,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + - group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; } static inline unsigned long group_faults_priv(struct numa_group *ng) @@ -2384,7 +2385,7 @@ static void task_numa_placement(struct task_struct *p) * is at the beginning of the numa_faults array. 
*/ ng->faults[mem_idx] += diff; - ng->faults_cpu[mem_idx] += f_diff; + ng->faults[cpu_idx] += f_diff; ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } @@ -2450,9 +2451,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; - /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * - nr_node_ids; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; -- cgit From 7d380f24fe662033fd21a65f678057abd293f76e Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Mon, 4 Oct 2021 16:27:05 +0530 Subject: sched/numa: Fix a few comments Fix a few comments to help understand them better. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20211004105706.3669-4-bharata@amd.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfbd5ef0b558..87db481e8a56 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2074,7 +2074,7 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find out how many nodes on the workload is actively running on. Do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. @@ -2128,7 +2128,7 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest + * completely idle or all activity is in areas that are not of interest * to automatic numa balancing. Related to that, if there were failed * migration then it implies we are migrating too quickly or the local * node is overloaded. In either case, scan slower -- cgit From f9ec6fea201429b5a3f76319e943989f1a1e25ef Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 15 Sep 2021 14:31:58 +0800 Subject: sched/topology: Remove unused numa_distance in cpu_attach_domain() numa_distance in cpu_attach_domain() is introduced in commit b5b217346de8 ("sched/topology: Warn when NUMA diameter > 2") to warn user when NUMA diameter > 2 as we'll misrepresent the scheduler topology structures at that time. This is fixed by Barry in commit 585b6d2723dc ("sched/topology: fix the issue groups don't span domain->span for NUMA diameter > 2") and numa_distance is unused now. So remove it. Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Barry Song Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20210915063158.80639-1-yangyicong@hisilicon.com --- kernel/sched/topology.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c56faae461d9..5af3edd34d6d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -688,7 +688,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. 
*/ for (tmp = sd; tmp; ) { @@ -732,9 +731,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } } - for (tmp = sd; tmp; tmp = tmp->parent) - numa_distance += !!(tmp->flags & SD_NUMA); - sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); -- cgit From 4ef0c5c6b5ba1f38f0ea1cedad0cad722f00c14a Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Wed, 15 Sep 2021 14:40:30 +0800 Subject: kernel/sched: Fix sched_fork() access an invalid sched_task_group There is a small race between copy_process() and sched_fork() where child->sched_task_group points to an already freed pointer. parent doing fork() | someone moving the parent | to another cgroup -------------------------------+------------------------------- copy_process() + dup_task_struct()<1> parent move to another cgroup, and free the old cgroup. <2> + sched_fork() + __set_task_cpu()<3> + task_fork_fair() + sched_slice()<4> In the worst case, this bug can lead to "use-after-free" and cause a panic as shown above: (1) the parent copies its sched_task_group to the child at <1>; (2) someone moves the parent to another cgroup and frees the old cgroup at <2>; (3) the sched_task_group and cfs_rq that belong to the old cgroup will be accessed at <3> and <4>, which causes a panic: [] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [] PGD 8000001fa0a86067 P4D 8000001fa0a86067 PUD 2029955067 PMD 0 [] Oops: 0000 [#1] SMP PTI [] CPU: 7 PID: 648398 Comm: ebizzy Kdump: loaded Tainted: G OE --------- - - 4.18.0.x86_64+ #1 [] RIP: 0010:sched_slice+0x84/0xc0 [] Call Trace: [] task_fork_fair+0x81/0x120 [] sched_fork+0x132/0x240 [] copy_process.part.5+0x675/0x20e0 [] ? __handle_mm_fault+0x63f/0x690 [] _do_fork+0xcd/0x3b0 [] do_syscall_64+0x5d/0x1d0 [] entry_SYSCALL_64_after_hwframe+0x65/0xca [] RIP: 0033:0x7f04418cd7e1 Between cgroup_can_fork() and cgroup_post_fork(), the cgroup membership and thus sched_task_group can't change. So update child's sched_task_group at sched_post_fork() and move task_fork() and __set_task_cpu() (which access the sched_task_group) from sched_fork() to sched_post_fork().
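A hypothetical reproducer sketch (not taken from the original report) would be to hammer fork() while a second process keeps migrating the forking task between two cpu cgroups. The cgroup mount point and group names below are assumptions and must exist on the test system:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    /* Append @pid to the cgroup.procs file at @path, ignoring errors. */
    static void move_to(const char *path, pid_t pid)
    {
        FILE *f = fopen(path, "w");

        if (!f)
            return;
        fprintf(f, "%d\n", pid);
        fclose(f);
    }

    int main(void)
    {
        pid_t forker = fork();

        if (forker < 0)
            return 1;
        if (forker == 0) {
            for (;;) {                  /* fork/reap as fast as possible */
                pid_t c = fork();

                if (c == 0)
                    _exit(0);
                if (c > 0)
                    waitpid(c, NULL, 0);
            }
        }

        for (;;) {                      /* bounce the forker between groups */
            move_to("/sys/fs/cgroup/cpu/A/cgroup.procs", forker);
            move_to("/sys/fs/cgroup/cpu/B/cgroup.procs", forker);
        }
        return 0;
    }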
Fixes: 8323f26ce342 ("sched: Fix race in task_group") Signed-off-by: Zhang Qiao Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lkml.kernel.org/r/20210915064030.2231-1-zhangqiao22@huawei.com --- include/linux/sched/task.h | 3 ++- kernel/fork.c | 2 +- kernel/sched/core.c | 43 ++++++++++++++++++++++--------------------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ef02be869cf2..ba88a6987400 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,7 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p); +extern void sched_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index 38681ad44c76..0e4251dc5436 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2405,7 +2405,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - sched_post_fork(p); + sched_post_fork(p, args); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b55ef99c03f..935c2da00339 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4343,8 +4343,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, */ int sched_fork(unsigned long clone_flags, struct task_struct *p) { - unsigned long flags; - __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -4390,24 +4388,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - rseq_migrate(p); - /* - * We're setting the CPU for the first time, we don't migrate, - * so use __set_task_cpu(). - */ - __set_task_cpu(p, smp_processor_id()); - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4423,8 +4403,29 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p) +void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) { + unsigned long flags; +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; +#endif + + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_CGROUP_SCHED + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + p->sched_task_group = autogroup_task_group(p, tg); +#endif + rseq_migrate(p); + /* + * We're setting the CPU for the first time, we don't migrate, + * so use __set_task_cpu(). 
+ */ + __set_task_cpu(p, smp_processor_id()); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + uclamp_post_fork(p); } -- cgit From 804bccba71a57e7e5deb507a4c8ebbab730909c0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Sep 2021 19:54:50 -0700 Subject: sched: Fill unconditional hole induced by sched_entity With struct sched_entity before the other sched entities, its alignment won't induce a struct hole. This saves 64 bytes in defconfig task_struct: Before: ... unsigned int rt_priority; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ const struct sched_class * sched_class; /* 128 8 */ /* XXX 56 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ struct sched_entity se __attribute__((__aligned__(64))); /* 192 448 */ /* --- cacheline 10 boundary (640 bytes) --- */ struct sched_rt_entity rt; /* 640 48 */ struct sched_dl_entity dl __attribute__((__aligned__(8))); /* 688 224 */ /* --- cacheline 14 boundary (896 bytes) was 16 bytes ago --- */ After: ... unsigned int rt_priority; /* 120 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ struct sched_entity se __attribute__((__aligned__(64))); /* 128 448 */ /* --- cacheline 9 boundary (576 bytes) --- */ struct sched_rt_entity rt; /* 576 48 */ struct sched_dl_entity dl __attribute__((__aligned__(8))); /* 624 224 */ /* --- cacheline 13 boundary (832 bytes) was 16 bytes ago --- */ Summary diff: - /* size: 7040, cachelines: 110, members: 188 */ + /* size: 6976, cachelines: 109, members: 188 */ Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210924025450.4138503-1-keescook@chromium.org --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 193e16e2d0e4..343603f77f8b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,10 +775,10 @@ struct task_struct { int normal_prio; unsigned int rt_priority; - const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; -- cgit From 54354c6a9f7fd5572d2b9ec108117c4f376d4d23 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:13 -0700 Subject: Revert "proc/wchan: use printk format instead of lookup_symbol_name()" This reverts commit 152c432b128cb043fc107e8f211195fe94b2159c. When a kernel address couldn't be symbolized for /proc/$pid/wchan, it would leak the raw value, a potential information exposure. This is a regression compared to the safer pre-v5.12 behavior. 
Reported-by: kernel test robot Reported-by: Vito Caputo Reported-by: Jann Horn Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211008111626.090829198@infradead.org --- fs/proc/base.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 533d5836eb9a..1f394095eb88 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -386,17 +387,19 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; + char symname[KSYM_NAME_LEN]; - if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - wchan = get_wchan(task); - else - wchan = 0; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; - if (wchan) - seq_printf(m, "%ps", (void *) wchan); - else - seq_putc(m, '0'); + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { + seq_puts(m, symname); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ -- cgit From cf2a85efdade117e2169d6e26641016cbbf03ef0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:18 -0700 Subject: leaking_addresses: Always print a trailing newline For files that lack trailing newlines and match a leaking address (e.g. wchan[1]), the leaking_addresses.pl report would run together with the next line, making things look corrupted. Unconditionally remove the newline on input, and write it back out on output. [1] https://lore.kernel.org/all/20210103142726.GC30643@xsang-OptiPlex-9020/ Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211008111626.151570317@infradead.org --- scripts/leaking_addresses.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/leaking_addresses.pl b/scripts/leaking_addresses.pl index b2d8b8aa2d99..8f636a23bc3f 100755 --- a/scripts/leaking_addresses.pl +++ b/scripts/leaking_addresses.pl @@ -455,8 +455,9 @@ sub parse_file open my $fh, "<", $file or return; while ( <$fh> ) { + chomp; if (may_leak_address($_)) { - print $file . ': ' . $_; + printf("$file: $_\n"); } } close $fh; -- cgit From 4e046156792c26bef8a4e30be711777fc8578257 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:15 -0700 Subject: proc: Use task_is_running() for wchan in /proc/$pid/stat The implementations of get_wchan() can be expensive. The only information imparted here is whether or not a process is currently blocked in the scheduler (and even this doesn't need to be exact). Avoid doing the heavy lifting of stack walking and just report that information by using task_is_running(). 
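After this change the wchan field of /proc/$pid/stat degenerates to a plain 0/1 "is the task blocked?" indicator. A rough userspace reader, assuming the usual proc(5) field numbering (wchan is conventionally field 35; treat the index as an assumption):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/self/stat", "r");
        char comm[64];
        unsigned long wchan = 0;
        int i;

        if (!f)
            return 1;
        fscanf(f, "%*d (%63[^)])", comm);       /* fields 1-2: pid, comm */
        for (i = 3; i < 35; i++)                /* skip fields 3..34 */
            fscanf(f, " %*s");
        fscanf(f, " %lu", &wchan);              /* field 35: wchan */
        fclose(f);

        printf("blocked: %lu\n", wchan);        /* now simply 0 or 1 */
        return 0;
    }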
Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211008111626.211281780@infradead.org --- fs/proc/array.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 49be8c8ef555..77cf4187adec 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -541,7 +541,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } if (permitted && (!whole || num_threads < 2)) - wchan = get_wchan(task); + wchan = !task_is_running(task); if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; @@ -606,10 +606,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * * This works with older implementations of procps as well. */ - if (wchan) - seq_puts(m, " 1"); - else - seq_puts(m, " 0"); + seq_put_decimal_ull(m, " ", wchan); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", 0); -- cgit From bc9bbb81730ea667c31c5b284f95ee312bab466f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 29 Sep 2021 15:02:17 -0700 Subject: x86: Fix get_wchan() to support the ORC unwinder Currently, the kernel CONFIG_UNWINDER_ORC option is enabled by default on x86, but the implementation of get_wchan() is still based on the frame pointer unwinder, so the /proc//wchan usually returned 0 regardless of whether the task is running. Reimplement get_wchan() by calling stack_trace_save_tsk(), which is adapted to the ORC and frame pointer unwinders. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Signed-off-by: Qi Zheng Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211008111626.271115116@infradead.org --- arch/x86/kernel/process.c | 51 +++-------------------------------------------- 1 file changed, 3 insertions(+), 48 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d9463e3096b..e645925f9f02 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -944,58 +944,13 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) */ unsigned long get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip, ret = 0; - int count = 0; + unsigned long entry = 0; if (p == current || task_is_running(p)) return 0; - if (!try_get_task_stack(p)) - return 0; - - start = (unsigned long)task_stack_page(p); - if (!start) - goto out; - - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. 
- */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start; - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - goto out; - - fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); - do { - if (fp < bottom || fp > top) - goto out; - ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) { - ret = ip; - goto out; - } - fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && !task_is_running(p)); - -out: - put_task_stack(p); - return ret; + stack_trace_save_tsk(p, &entry, 1, 0); + return entry; } long do_arch_prctl_common(struct task_struct *task, int option, -- cgit From 42a20f86dc19f9282d974df0ba4d226c865ab9dd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 29 Sep 2021 15:02:14 -0700 Subject: sched: Add wrapper for get_wchan() to keep task blocked Having a stable wchan means the process must be blocked and for it to stay that way while performing stack unwinding. Suggested-by: Peter Zijlstra Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven Acked-by: Russell King (Oracle) [arm] Tested-by: Mark Rutland [arm64] Link: https://lkml.kernel.org/r/20211008111626.332092234@infradead.org --- arch/alpha/include/asm/processor.h | 2 +- arch/alpha/kernel/process.c | 5 ++--- arch/arc/include/asm/processor.h | 2 +- arch/arc/kernel/stacktrace.c | 4 ++-- arch/arm/include/asm/processor.h | 2 +- arch/arm/kernel/process.c | 4 +--- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/kernel/process.c | 4 +--- arch/csky/include/asm/processor.h | 2 +- arch/csky/kernel/stacktrace.c | 5 ++--- arch/h8300/include/asm/processor.h | 2 +- arch/h8300/kernel/process.c | 5 +---- arch/hexagon/include/asm/processor.h | 2 +- arch/hexagon/kernel/process.c | 4 +--- arch/ia64/include/asm/processor.h | 2 +- arch/ia64/kernel/process.c | 5 +---- arch/m68k/include/asm/processor.h | 2 +- arch/m68k/kernel/process.c | 4 +--- arch/microblaze/include/asm/processor.h | 2 +- arch/microblaze/kernel/process.c | 2 +- arch/mips/include/asm/processor.h | 2 +- arch/mips/kernel/process.c | 8 +++----- arch/nds32/include/asm/processor.h | 2 +- arch/nds32/kernel/process.c | 7 +------ arch/nios2/include/asm/processor.h | 2 +- arch/nios2/kernel/process.c | 5 +---- arch/openrisc/include/asm/processor.h | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/parisc/include/asm/processor.h | 2 +- arch/parisc/kernel/process.c | 5 +---- arch/powerpc/include/asm/processor.h | 2 +- arch/powerpc/kernel/process.c | 9 +++------ arch/riscv/include/asm/processor.h | 2 +- arch/riscv/kernel/stacktrace.c | 12 +++++------- arch/s390/include/asm/processor.h | 2 +- arch/s390/kernel/process.c | 4 ++-- arch/sh/include/asm/processor_32.h | 2 +- arch/sh/kernel/process_32.c | 5 +---- arch/sparc/include/asm/processor_32.h | 2 +- arch/sparc/include/asm/processor_64.h | 2 +- arch/sparc/kernel/process_32.c | 5 +---- arch/sparc/kernel/process_64.c | 5 +---- arch/um/include/asm/processor-generic.h | 2 +- arch/um/kernel/process.c | 5 +---- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/process.c | 5 +---- arch/xtensa/include/asm/processor.h | 2 +- arch/xtensa/kernel/process.c | 5 +---- include/linux/sched.h | 1 + kernel/sched/core.c | 19 +++++++++++++++++++ 50 files changed, 80 insertions(+), 112 deletions(-) diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 6100431da07a..090499c99c1c 100644 --- a/arch/alpha/include/asm/processor.h +++ 
b/arch/alpha/include/asm/processor.h @@ -42,7 +42,7 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); struct task_struct; extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index a5123ea426ce..5f8527081da9 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -376,12 +376,11 @@ thread_saved_pc(struct task_struct *t) } unsigned long -get_wchan(struct task_struct *p) +__get_wchan(struct task_struct *p) { unsigned long schedule_frame; unsigned long pc; - if (!p || p == current || task_is_running(p)) - return 0; + /* * This one depends on the frame size of schedule(). Do a * "disass schedule" in gdb to find the frame size. Also, the diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index f28afcf5c6d1..54db9d7bb562 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -70,7 +70,7 @@ struct task_struct; extern void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long usp); -extern unsigned int get_wchan(struct task_struct *p); +extern unsigned int __get_wchan(struct task_struct *p); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index c376ff3147e7..5372dc04e784 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -15,7 +15,7 @@ * = specifics of data structs where trace is saved(CONFIG_STACKTRACE etc) * * vineetg: March 2009 - * -Implemented correct versions of thread_saved_pc() and get_wchan() + * -Implemented correct versions of thread_saved_pc() and __get_wchan() * * rajeshwarr: 2008 * -Initial implementation @@ -248,7 +248,7 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) * Of course just returning schedule( ) would be pointless so unwind until * the function is not in schedular code */ -unsigned int get_wchan(struct task_struct *tsk) +unsigned int __get_wchan(struct task_struct *tsk) { return arc_unwind_core(tsk, NULL, __get_first_nonsched, NULL); } diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 9e6b97286307..6af68edfa53a 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -84,7 +84,7 @@ struct task_struct; /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e2d3051741e..96f577e59595 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -276,13 +276,11 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct stackframe frame; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ee2bdc1b9f5b..55ca034238ea 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -257,7 +257,7 @@ struct task_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); void update_sctlr_el1(u64 sctlr); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 40adb8cdbf5a..aacf2f5559a8 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -528,13 +528,11 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, return last; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct stackframe frame; unsigned long stack_page, ret = 0; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; stack_page = (unsigned long)try_get_task_stack(p); if (!stack_page) diff --git a/arch/csky/include/asm/processor.h b/arch/csky/include/asm/processor.h index 9e933021fe8e..817dd60ff152 100644 --- a/arch/csky/include/asm/processor.h +++ b/arch/csky/include/asm/processor.h @@ -81,7 +81,7 @@ static inline void release_thread(struct task_struct *dead_task) extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->usp) diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index 1b280ef08004..9f78f5d21511 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -111,12 +111,11 @@ static bool save_wchan(unsigned long pc, void *arg) return false; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc = 0; - if (likely(task && task != current && !task_is_running(task))) - walk_stackframe(task, NULL, save_wchan, &pc); + walk_stackframe(task, NULL, save_wchan, &pc); return pc; } diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index a060b41b2d31..141a23eb62b7 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -105,7 +105,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ ({ \ diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 2ac27e4248a4..8833fa4f5d51 100644 --- 
a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -128,15 +128,12 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - stack_page = (unsigned long)p; fp = ((struct pt_regs *)p->thread.ksp)->er6; do { diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 9f0cc99420be..615f7e49968e 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -64,7 +64,7 @@ struct thread_struct { extern void release_thread(struct task_struct *dead_task); /* Get wait channel for task P. */ -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); /* The following stuff is pretty HEXAGON specific. */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 6a6835fb4242..232dfd8956aa 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -130,13 +130,11 @@ void flush_thread(void) * is an identification of the point at which the scheduler * was invoked by a blocked thread. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; stack_page = (unsigned long)task_stack_page(p); fp = ((struct hexagon_switch_stack *)p->thread.switch_sp)->fp; diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 2d8bcdc27d7f..45365c2ef598 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -330,7 +330,7 @@ struct task_struct; #define release_thread(dead_task) /* Get wait channel for task P. */ -extern unsigned long get_wchan (struct task_struct *p); +extern unsigned long __get_wchan (struct task_struct *p); /* Return instruction pointer of blocked task TSK. */ #define KSTK_EIP(tsk) \ diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index e56d63f4abf9..834df24a88f1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -523,15 +523,12 @@ exit_thread (struct task_struct *tsk) } unsigned long -get_wchan (struct task_struct *p) +__get_wchan (struct task_struct *p) { struct unw_frame_info info; unsigned long ip; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - /* * Note: p may not be a blocked task (it could be current or * another process running on some other CPU. 
Rather than diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index f4d82c619a5c..ffeda9aa526a 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -150,7 +150,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ ({ \ diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 1ab692b952cd..a6030dbaa089 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -263,13 +263,11 @@ int dump_fpu (struct pt_regs *regs, struct user_m68kfp_struct *fpu) } EXPORT_SYMBOL(dump_fpu); -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; stack_page = (unsigned long)task_stack_page(p); fp = ((struct switch_stack *)p->thread.ksp)->a6; diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 06c6e493590a..7e9e92670df3 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -68,7 +68,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! */ # define KERNEL_STACK_SIZE 0x2000 diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 62aa237180b6..5e2b91c1e8ce 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -112,7 +112,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { /* TBD (used by procfs) */ return 0; diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 0c3550c82b72..252ed38ce8c5 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -369,7 +369,7 @@ static inline void flush_thread(void) { } -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define __KSTK_TOS(tsk) ((unsigned long)task_stack_page(tsk) + \ THREAD_SIZE - 32 - sizeof(struct pt_regs)) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 95aa86fa6077..cbff1b974f88 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -511,7 +511,7 @@ static int __init frame_info_init(void) /* * Without schedule() frame info, result given by - * thread_saved_pc() and get_wchan() are not reliable. + * thread_saved_pc() and __get_wchan() are not reliable. */ if (schedule_mfi.pc_offset < 0) printk("Can't analyze schedule() prologue at %p\n", schedule); @@ -652,9 +652,9 @@ unsigned long unwind_stack(struct task_struct *task, unsigned long *sp, #endif /* - * get_wchan - a maintenance nightmare^W^Wpain in the ass ... + * __get_wchan - a maintenance nightmare^W^Wpain in the ass ... 
*/ -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc = 0; #ifdef CONFIG_KALLSYMS @@ -662,8 +662,6 @@ unsigned long get_wchan(struct task_struct *task) unsigned long ra = 0; #endif - if (!task || task == current || task_is_running(task)) - goto out; if (!task_stack_page(task)) goto out; diff --git a/arch/nds32/include/asm/processor.h b/arch/nds32/include/asm/processor.h index b82369c7659d..e6bfc74972bb 100644 --- a/arch/nds32/include/asm/processor.h +++ b/arch/nds32/include/asm/processor.h @@ -83,7 +83,7 @@ extern struct task_struct *last_task_used_math; /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index 391895b54d13..49fab9e39cbf 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -233,15 +233,12 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t * fpu) EXPORT_SYMBOL(dump_fpu); -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, lr; unsigned long stack_start, stack_end; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - if (IS_ENABLED(CONFIG_FRAME_POINTER)) { stack_start = (unsigned long)end_of_stack(p); stack_end = (unsigned long)task_stack_page(p) + THREAD_SIZE; @@ -258,5 +255,3 @@ unsigned long get_wchan(struct task_struct *p) } return 0; } - -EXPORT_SYMBOL(get_wchan); diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index 94bcb86f679f..b8125dfbcad2 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -69,7 +69,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 9ff37ba2bb60..f8ea522a1588 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -217,15 +217,12 @@ void dump(struct pt_regs *fp) pr_emerg("\n\n"); } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - stack_page = (unsigned long)p; fp = ((struct switch_stack *)p->thread.ksp)->fp; /* ;dgt2 */ do { diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index ad53b3184885..aa1699c18add 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -73,7 +73,7 @@ struct thread_struct { void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index b0698d9ce14f..3c0c91bcdcba 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -263,7 +263,7 @@ void dump_elf_thread(elf_greg_t *dest, struct pt_regs* regs) dest[35] = 0; } -unsigned long get_wchan(struct task_struct 
*p) +unsigned long __get_wchan(struct task_struct *p) { /* TODO */ diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index eeb7da064289..85a2dbfe5278 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -273,7 +273,7 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs.iaoq[0]) #define KSTK_ESP(tsk) ((tsk)->thread.regs.gr[30]) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 38ec4ae81239..4f562dee65a2 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -240,15 +240,12 @@ copy_thread(unsigned long clone_flags, unsigned long usp, } unsigned long -get_wchan(struct task_struct *p) +__get_wchan(struct task_struct *p) { struct unwind_frame_info info; unsigned long ip; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - /* * These bracket the sleeping functions.. */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index f348e564f7dd..e39bd0ff69f3 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -300,7 +300,7 @@ struct thread_struct { #define task_pt_regs(tsk) ((tsk)->thread.regs) -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.regs? (tsk)->thread.regs->nip: 0) #define KSTK_ESP(tsk) ((tsk)->thread.regs? (tsk)->thread.regs->gpr[1]: 0) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 50436b52c213..406d7ee9e322 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2111,14 +2111,11 @@ int validate_sp(unsigned long sp, struct task_struct *p, EXPORT_SYMBOL(validate_sp); -static unsigned long __get_wchan(struct task_struct *p) +static unsigned long ___get_wchan(struct task_struct *p) { unsigned long ip, sp; int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - sp = p->thread.ksp; if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) return 0; @@ -2137,14 +2134,14 @@ static unsigned long __get_wchan(struct task_struct *p) return 0; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long ret; if (!try_get_task_stack(p)) return 0; - ret = __get_wchan(p); + ret = ___get_wchan(p); put_task_stack(p); diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 46b492c78cbb..0749924d9e55 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -66,7 +66,7 @@ static inline void release_thread(struct task_struct *dead_task) { } -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); static inline void wait_for_interrupt(void) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 315db3d0229b..0fcdc0233fac 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -128,16 +128,14 @@ static bool save_wchan(void *arg, unsigned long pc) return true; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc = 0; - if (likely(task && task != current && !task_is_running(task))) { - if 
(!try_get_task_stack(task)) - return 0; - walk_stackframe(task, NULL, save_wchan, &pc); - put_task_stack(task); - } + if (!try_get_task_stack(task)) + return 0; + walk_stackframe(task, NULL, save_wchan, &pc); + put_task_stack(task); return pc; } diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 879b8e3f609c..f54c152bf2bf 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -192,7 +192,7 @@ static inline void release_thread(struct task_struct *tsk) { } void guarded_storage_release(struct task_struct *tsk); void gs_load_bc_cb(struct pt_regs *regs); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ (task_stack_page(tsk) + THREAD_SIZE) - 1) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->psw.addr) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 350e94d0cac2..e5dd46b1bff8 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -181,12 +181,12 @@ void execve_tail(void) asm volatile("sfpc %0" : : "d" (0)); } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long ip = 0; - if (!p || p == current || task_is_running(p) || !task_stack_page(p)) + if (!task_stack_page(p)) return 0; if (!try_get_task_stack(p)) diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index aa92cc933889..45240ec6b85a 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -180,7 +180,7 @@ static inline void show_code(struct pt_regs *regs) } #endif -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->regs[15]) diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 717de05c81f4..1c28e3cddb60 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -182,13 +182,10 @@ __switch_to(struct task_struct *prev, struct task_struct *next) return prev; } -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long pc; - if (!p || p == current || task_is_running(p)) - return 0; - /* * The same comment as on the Alpha applies here, too ... */ diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index b6242f7771e9..647bf0ac7beb 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -89,7 +89,7 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, /* Free all resources held by a thread. */ #define release_thread(tsk) do { } while(0) -unsigned long get_wchan(struct task_struct *); +unsigned long __get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) #define KSTK_EIP(tsk) ((tsk)->thread.kregs->pc) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 5cf145f18f36..ae851e8fce4c 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -183,7 +183,7 @@ do { \ /* Free all resources held by a thread. 
*/ #define release_thread(tsk) do { } while (0) -unsigned long get_wchan(struct task_struct *task); +unsigned long __get_wchan(struct task_struct *task); #define task_pt_regs(tsk) (task_thread_info(tsk)->kregs) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->tpc) diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index bbbe0cfef746..2dc0bf9fe62e 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -365,7 +365,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, return 0; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; unsigned long task_base = (unsigned long) task; @@ -373,9 +373,6 @@ unsigned long get_wchan(struct task_struct *task) struct reg_window32 *rw; int count = 0; - if (!task || task == current || task_is_running(task)) - goto out; - fp = task_thread_info(task)->ksp + bias; do { /* Bogus frame pointer? */ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index d1cc410d2f64..f5b2cac8669f 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -663,7 +663,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } -unsigned long get_wchan(struct task_struct *task) +unsigned long __get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; struct thread_info *tp; @@ -671,9 +671,6 @@ unsigned long get_wchan(struct task_struct *task) unsigned long ret = 0; int count = 0; - if (!task || task == current || task_is_running(task)) - goto out; - tp = task_thread_info(task); bias = STACK_BIAS; fp = task_thread_info(task)->ksp + bias; diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index b5cf0ed116d9..579692a40a55 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -106,6 +106,6 @@ extern struct cpuinfo_um boot_cpu_data; #define cache_line_size() (boot_cpu_data.cache_alignment) #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf) -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #endif diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 457a38db368b..82107373ac7e 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -364,14 +364,11 @@ unsigned long arch_align_stack(unsigned long sp) } #endif -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long stack_page, sp, ip; bool seen_sched = 0; - if ((p == NULL) || (p == current) || task_is_running(p)) - return 0; - stack_page = (unsigned long) task_stack_page(p); /* Bail if the process has no kernel stack for some reason */ if (stack_page == 0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9ad2acaaae9b..e81177edc681 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -589,7 +589,7 @@ static inline void load_sp0(unsigned long sp0) /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); /* * Generic CPUID function diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e645925f9f02..a69885a9d711 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -942,13 +942,10 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * because the task might wake up and we might look at a stack * changing under us. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long entry = 0; - if (p == current || task_is_running(p)) - return 0; - stack_trace_save_tsk(p, &entry, 1, 0); return entry; } diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 7f63aca6a0d3..ad15fbc57283 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -215,7 +215,7 @@ struct mm_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) -extern unsigned long get_wchan(struct task_struct *p); +extern unsigned long __get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->areg[1]) diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 060165340612..47f933fed870 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -298,15 +298,12 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, * These bracket the sleeping functions.. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { unsigned long sp, pc; unsigned long stack_page = (unsigned long) task_stack_page(p); int count = 0; - if (!p || p == current || task_is_running(p)) - return 0; - sp = p->thread.sp; pc = MAKE_PC_FROM_RA(p->thread.ra, p->thread.sp); diff --git a/include/linux/sched.h b/include/linux/sched.h index 343603f77f8b..5f8db54226af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) #endif /* CONFIG_SMP */ extern bool sched_task_on_rq(struct task_struct *p); +extern unsigned long get_wchan(struct task_struct *p); /* * In order to reduce various lock holder preemption latencies provide an diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 935c2da00339..f2611b9cf503 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1966,6 +1966,25 @@ bool sched_task_on_rq(struct task_struct *p) return task_on_rq_queued(p); } +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ip = 0; + unsigned int state; + + if (!p || p == current) + return 0; + + /* Only get wchan if task is blocked and we can keep it that way. 
*/ + raw_spin_lock_irq(&p->pi_lock); + state = READ_ONCE(p->__state); + smp_rmb(); /* see try_to_wake_up() */ + if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) + ip = __get_wchan(p); + raw_spin_unlock_irq(&p->pi_lock); + + return ip; +} + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) -- cgit From 37b47298ab864fb3f5488ddebfc35267ceab0553 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Oct 2021 16:08:07 +0200 Subject: sched: Disable -Wunused-but-set-variable The compilers can't deal with obvious DCE vs that warning, resulting in code like: if (0) { sched sched_statistics *stats; stats = __schedstats_from_se(se); ... } triggering the warning. Kill the warning to make the robots stop reporting this. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nathan Chancellor Link: https://lkml.kernel.org/r/YWWPLnaZGybHsTkv@hirez.programming.kicks-ass.net --- kernel/sched/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 978fcfca5871..c7421f2d05e1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -3,6 +3,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# The compilers are complaining about unused variables inside an if(0) scope +# block. This is daft, shut them up. +ccflags-y += $(call cc-disable-warning, unused-but-set-variable) + # These files are disabled because they produce non-interesting flaky coverage # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -- cgit From c5e22feffdd736cb02b98b0f5b375c8ebc858dd4 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 24 Sep 2021 20:51:02 +1200 Subject: topology: Represent clusters of CPUs within a die Both ACPI and DT provide the ability to describe additional layers of topology between that of individual cores and higher level constructs such as the level at which the last level cache is shared. In ACPI this can be represented in PPTT as a Processor Hierarchy Node Structure [1] that is the parent of the CPU cores and in turn has a parent Processor Hierarchy Nodes Structure representing a higher level of topology. For example Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each cluster has 4 cpus. All clusters share L3 cache data, but each cluster has local L3 tag. On the other hand, each clusters will share some internal system bus. 
[ASCII diagram: one Kunpeng 920 NUMA node, six clusters of four CPUs each; each cluster has its own local L3 tag block, and all clusters attach to the shared L3 data and the internal system bus.]

That means spreading tasks among clusters will bring more bandwidth, while packing tasks within one cluster will lead to smaller cache synchronization latency. So both kernel and userspace will have a chance to leverage this topology to deploy tasks accordingly, achieving either smaller cache latency within one cluster or an even distribution of load among clusters for higher throughput.

This patch exposes cluster topology to both kernel and userspace. Libraries like hwloc will know clusters via cluster_cpus and related sysfs attributes. A PoC of hwloc support is at [2].

Note this patch only handles the ACPI case. Special consideration is needed for SMT processors, where it is necessary to move 2 levels up the hierarchy from the leaf nodes (thus skipping the processor core level). Note that arm64 / ACPI does not provide any means of identifying a die level in the topology, but that may be unrelated to the cluster level.
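For illustration, a minimal userspace sketch (not part of this patch) of how a library such as hwloc could walk the new attributes; it assumes the sysfs paths documented in the ABI update below:

  /* cluster_map.c - illustrative only, not part of this patch. */
  #include <stdio.h>

  int main(void)
  {
          char path[128], list[256];
          int cpu, id;
          FILE *f;

          for (cpu = 0; ; cpu++) {
                  snprintf(path, sizeof(path),
                           "/sys/devices/system/cpu/cpu%d/topology/cluster_id", cpu);
                  f = fopen(path, "r");
                  if (!f)
                          break;          /* no such CPU, or no cluster support */
                  if (fscanf(f, "%d", &id) != 1)
                          id = -1;        /* -1: no cluster level described */
                  fclose(f);

                  snprintf(path, sizeof(path),
                           "/sys/devices/system/cpu/cpu%d/topology/cluster_cpus_list",
                           cpu);
                  f = fopen(path, "r");
                  if (f && fgets(list, sizeof(list), f))
                          printf("cpu%-3d cluster_id=%-4d cluster_cpus=%s",
                                 cpu, id, list);  /* list already ends in '\n' */
                  if (f)
                          fclose(f);
          }
          return 0;
  }
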
[1] ACPI Specification 6.3 - section 5.2.29.1 processor hierarchy node structure (Type 0) [2] https://github.com/hisilicon/hwloc/tree/linux-cluster Signed-off-by: Jonathan Cameron Signed-off-by: Tian Tao Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210924085104.44806-2-21cnbao@gmail.com --- Documentation/ABI/stable/sysfs-devices-system-cpu | 15 +++++ Documentation/admin-guide/cputopology.rst | 12 ++-- arch/arm64/kernel/topology.c | 2 + drivers/acpi/pptt.c | 67 +++++++++++++++++++++++ drivers/base/arch_topology.c | 15 +++++ drivers/base/topology.c | 10 ++++ include/linux/acpi.h | 5 ++ include/linux/arch_topology.h | 5 ++ include/linux/topology.h | 6 ++ 9 files changed, 133 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-devices-system-cpu b/Documentation/ABI/stable/sysfs-devices-system-cpu index 516dafea03eb..3965ce504484 100644 --- a/Documentation/ABI/stable/sysfs-devices-system-cpu +++ b/Documentation/ABI/stable/sysfs-devices-system-cpu @@ -42,6 +42,12 @@ Description: the CPU core ID of cpuX. Typically it is the hardware platform's architecture and platform dependent. Values: integer +What: /sys/devices/system/cpu/cpuX/topology/cluster_id +Description: the cluster ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. +Values: integer + What: /sys/devices/system/cpu/cpuX/topology/book_id Description: the book ID of cpuX. Typically it is the hardware platform's identifier (rather than the kernel's). The actual value is @@ -85,6 +91,15 @@ Description: human-readable list of CPUs within the same die. The format is like 0-3, 8-11, 14,17. Values: decimal list. +What: /sys/devices/system/cpu/cpuX/topology/cluster_cpus +Description: internal kernel map of CPUs within the same cluster. +Values: hexadecimal bitmask. + +What: /sys/devices/system/cpu/cpuX/topology/cluster_cpus_list +Description: human-readable list of CPUs within the same cluster. + The format is like 0-3, 8-11, 14,17. +Values: decimal list. + What: /sys/devices/system/cpu/cpuX/topology/book_siblings Description: internal kernel map of cpuX's hardware threads within the same book_id. it's only used on s390. 
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst index b085dbac60a5..6b62e182baf4 100644 --- a/Documentation/admin-guide/cputopology.rst +++ b/Documentation/admin-guide/cputopology.rst @@ -19,11 +19,13 @@ these macros in include/asm-XXX/topology.h:: #define topology_physical_package_id(cpu) #define topology_die_id(cpu) + #define topology_cluster_id(cpu) #define topology_core_id(cpu) #define topology_book_id(cpu) #define topology_drawer_id(cpu) #define topology_sibling_cpumask(cpu) #define topology_core_cpumask(cpu) + #define topology_cluster_cpumask(cpu) #define topology_die_cpumask(cpu) #define topology_book_cpumask(cpu) #define topology_drawer_cpumask(cpu) @@ -39,10 +41,12 @@ not defined by include/asm-XXX/topology.h: 1) topology_physical_package_id: -1 2) topology_die_id: -1 -3) topology_core_id: 0 -4) topology_sibling_cpumask: just the given CPU -5) topology_core_cpumask: just the given CPU -6) topology_die_cpumask: just the given CPU +3) topology_cluster_id: -1 +4) topology_core_id: 0 +5) topology_sibling_cpumask: just the given CPU +6) topology_core_cpumask: just the given CPU +7) topology_cluster_cpumask: just the given CPU +8) topology_die_cpumask: just the given CPU For architectures that don't support books (CONFIG_SCHED_BOOK) there are no default definitions for topology_book_id() and topology_book_cpumask(). diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 4dd14a6620c1..9ab78ad826e2 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -103,6 +103,8 @@ int __init parse_acpi_topology(void) cpu_topology[cpu].thread_id = -1; cpu_topology[cpu].core_id = topology_id; } + topology_id = find_acpi_cpu_topology_cluster(cpu); + cpu_topology[cpu].cluster_id = topology_id; topology_id = find_acpi_cpu_topology_package(cpu); cpu_topology[cpu].package_id = topology_id; diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index fe69dc518f31..701f61c01359 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -746,6 +746,73 @@ int find_acpi_cpu_topology_package(unsigned int cpu) ACPI_PPTT_PHYSICAL_PACKAGE); } +/** + * find_acpi_cpu_topology_cluster() - Determine a unique CPU cluster value + * @cpu: Kernel logical CPU number + * + * Determine a topology unique cluster ID for the given CPU/thread. + * This ID can then be used to group peers, which will have matching ids. + * + * The cluster, if present is the level of topology above CPUs. In a + * multi-thread CPU, it will be the level above the CPU, not the thread. + * It may not exist in single CPU systems. In simple multi-CPU systems, + * it may be equal to the package topology level. + * + * Return: -ENOENT if the PPTT doesn't exist, the CPU cannot be found + * or there is no toplogy level above the CPU.. + * Otherwise returns a value which represents the package for this CPU. 
+ */ + +int find_acpi_cpu_topology_cluster(unsigned int cpu) +{ + struct acpi_table_header *table; + acpi_status status; + struct acpi_pptt_processor *cpu_node, *cluster_node; + u32 acpi_cpu_id; + int retval; + int is_thread; + + status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); + if (ACPI_FAILURE(status)) { + acpi_pptt_warn_missing(); + return -ENOENT; + } + + acpi_cpu_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (cpu_node == NULL || !cpu_node->parent) { + retval = -ENOENT; + goto put_table; + } + + is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD; + cluster_node = fetch_pptt_node(table, cpu_node->parent); + if (cluster_node == NULL) { + retval = -ENOENT; + goto put_table; + } + if (is_thread) { + if (!cluster_node->parent) { + retval = -ENOENT; + goto put_table; + } + cluster_node = fetch_pptt_node(table, cluster_node->parent); + if (cluster_node == NULL) { + retval = -ENOENT; + goto put_table; + } + } + if (cluster_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID) + retval = cluster_node->acpi_processor_id; + else + retval = ACPI_PTR_DIFF(cluster_node, table); + +put_table: + acpi_put_table(table); + + return retval; +} + /** * find_acpi_cpu_topology_hetero_id() - Get a core architecture tag * @cpu: Kernel logical CPU number diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 43407665918f..fc0836f460fb 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -600,6 +600,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return core_mask; } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return &cpu_topology[cpu].cluster_sibling; +} + void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -617,6 +622,12 @@ void update_siblings_masks(unsigned int cpuid) if (cpuid_topo->package_id != cpu_topo->package_id) continue; + if (cpuid_topo->cluster_id == cpu_topo->cluster_id && + cpuid_topo->cluster_id != -1) { + cpumask_set_cpu(cpu, &cpuid_topo->cluster_sibling); + cpumask_set_cpu(cpuid, &cpu_topo->cluster_sibling); + } + cpumask_set_cpu(cpuid, &cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpuid_topo->core_sibling); @@ -635,6 +646,9 @@ static void clear_cpu_topology(int cpu) cpumask_clear(&cpu_topo->llc_sibling); cpumask_set_cpu(cpu, &cpu_topo->llc_sibling); + cpumask_clear(&cpu_topo->cluster_sibling); + cpumask_set_cpu(cpu, &cpu_topo->cluster_sibling); + cpumask_clear(&cpu_topo->core_sibling); cpumask_set_cpu(cpu, &cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); @@ -650,6 +664,7 @@ void __init reset_cpu_topology(void) cpu_topo->thread_id = -1; cpu_topo->core_id = -1; + cpu_topo->cluster_id = -1; cpu_topo->package_id = -1; cpu_topo->llc_id = -1; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 43c0940643f5..8f2b641d0b8c 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -48,6 +48,9 @@ static DEVICE_ATTR_RO(physical_package_id); define_id_show_func(die_id); static DEVICE_ATTR_RO(die_id); +define_id_show_func(cluster_id); +static DEVICE_ATTR_RO(cluster_id); + define_id_show_func(core_id); static DEVICE_ATTR_RO(core_id); @@ -63,6 +66,10 @@ define_siblings_read_func(core_siblings, core_cpumask); static BIN_ATTR_RO(core_siblings, 0); static BIN_ATTR_RO(core_siblings_list, 0); +define_siblings_read_func(cluster_cpus, cluster_cpumask); +static BIN_ATTR_RO(cluster_cpus, 0); +static BIN_ATTR_RO(cluster_cpus_list, 0); + define_siblings_read_func(die_cpus, 
die_cpumask); static BIN_ATTR_RO(die_cpus, 0); static BIN_ATTR_RO(die_cpus_list, 0); @@ -94,6 +101,8 @@ static struct bin_attribute *bin_attrs[] = { &bin_attr_thread_siblings_list, &bin_attr_core_siblings, &bin_attr_core_siblings_list, + &bin_attr_cluster_cpus, + &bin_attr_cluster_cpus_list, &bin_attr_die_cpus, &bin_attr_die_cpus_list, &bin_attr_package_cpus, @@ -112,6 +121,7 @@ static struct bin_attribute *bin_attrs[] = { static struct attribute *default_attrs[] = { &dev_attr_physical_package_id.attr, &dev_attr_die_id.attr, + &dev_attr_cluster_id.attr, &dev_attr_core_id.attr, #ifdef CONFIG_SCHED_BOOK &dev_attr_book_id.attr, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 974d497a897d..fbc2146050a4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1353,6 +1353,7 @@ static inline int lpit_read_residency_count_address(u64 *address) #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); +int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); int find_acpi_cpu_cache_topology(unsigned int cpu, int level); @@ -1365,6 +1366,10 @@ static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } +static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) +{ + return -EINVAL; +} static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index f180240dc95f..b97cea83b25e 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -62,10 +62,12 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, struct cpu_topology { int thread_id; int core_id; + int cluster_id; int package_id; int llc_id; cpumask_t thread_sibling; cpumask_t core_sibling; + cpumask_t cluster_sibling; cpumask_t llc_sibling; }; @@ -73,13 +75,16 @@ struct cpu_topology { extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_physical_package_id(cpu) (cpu_topology[cpu].package_id) +#define topology_cluster_id(cpu) (cpu_topology[cpu].cluster_id) #define topology_core_id(cpu) (cpu_topology[cpu].core_id) #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) +#define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +const struct cpumask *cpu_clustergroup_mask(int cpu); void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); diff --git a/include/linux/topology.h b/include/linux/topology.h index 7634cd737061..80d27d717631 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -186,6 +186,9 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif +#ifndef topology_cluster_id +#define topology_cluster_id(cpu) ((void)(cpu), -1) +#endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif @@ -195,6 +198,9 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +#ifndef topology_cluster_cpumask +#define 
topology_cluster_cpumask(cpu) cpumask_of(cpu) +#endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif -- cgit From 778c558f49a2cb3dc7b18a80ff515e82aa813627 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 Sep 2021 20:51:03 +1200 Subject: sched: Add cluster scheduler level in core and related Kconfig for ARM64 This patch adds scheduler level for clusters and automatically enables the load balance among clusters. It will directly benefit a lot of workload which loves more resources such as memory bandwidth, caches. Testing has widely been done in two different hardware configurations of Kunpeng920: 24 cores in one NUMA(6 clusters in each NUMA node); 32 cores in one NUMA(8 clusters in each NUMA node) Workload is running on either one NUMA node or four NUMA nodes, thus, this can estimate the effect of cluster spreading w/ and w/o NUMA load balance. * Stream benchmark: 4threads stream (on 1NUMA * 24cores = 24cores) stream stream w/o patch w/ patch MB/sec copy 29929.64 ( 0.00%) 32932.68 ( 10.03%) MB/sec scale 29861.10 ( 0.00%) 32710.58 ( 9.54%) MB/sec add 27034.42 ( 0.00%) 32400.68 ( 19.85%) MB/sec triad 27225.26 ( 0.00%) 31965.36 ( 17.41%) 6threads stream (on 1NUMA * 24cores = 24cores) stream stream w/o patch w/ patch MB/sec copy 40330.24 ( 0.00%) 42377.68 ( 5.08%) MB/sec scale 40196.42 ( 0.00%) 42197.90 ( 4.98%) MB/sec add 37427.00 ( 0.00%) 41960.78 ( 12.11%) MB/sec triad 37841.36 ( 0.00%) 42513.64 ( 12.35%) 12threads stream (on 1NUMA * 24cores = 24cores) stream stream w/o patch w/ patch MB/sec copy 52639.82 ( 0.00%) 53818.04 ( 2.24%) MB/sec scale 52350.30 ( 0.00%) 53253.38 ( 1.73%) MB/sec add 53607.68 ( 0.00%) 55198.82 ( 2.97%) MB/sec triad 54776.66 ( 0.00%) 56360.40 ( 2.89%) Thus, it could help memory-bound workload especially under medium load. Similar improvement is also seen in lkp-pbzip2: * lkp-pbzip2 benchmark 2-96 threads (on 4NUMA * 24cores = 96cores) lkp-pbzip2 lkp-pbzip2 w/o patch w/ patch Hmean tput-2 11062841.57 ( 0.00%) 11341817.51 * 2.52%* Hmean tput-5 26815503.70 ( 0.00%) 27412872.65 * 2.23%* Hmean tput-8 41873782.21 ( 0.00%) 43326212.92 * 3.47%* Hmean tput-12 61875980.48 ( 0.00%) 64578337.51 * 4.37%* Hmean tput-21 105814963.07 ( 0.00%) 111381851.01 * 5.26%* Hmean tput-30 150349470.98 ( 0.00%) 156507070.73 * 4.10%* Hmean tput-48 237195937.69 ( 0.00%) 242353597.17 * 2.17%* Hmean tput-79 360252509.37 ( 0.00%) 362635169.23 * 0.66%* Hmean tput-96 394571737.90 ( 0.00%) 400952978.48 * 1.62%* 2-24 threads (on 1NUMA * 24cores = 24cores) lkp-pbzip2 lkp-pbzip2 w/o patch w/ patch Hmean tput-2 11071705.49 ( 0.00%) 11296869.10 * 2.03%* Hmean tput-4 20782165.19 ( 0.00%) 21949232.15 * 5.62%* Hmean tput-6 30489565.14 ( 0.00%) 33023026.96 * 8.31%* Hmean tput-8 40376495.80 ( 0.00%) 42779286.27 * 5.95%* Hmean tput-12 61264033.85 ( 0.00%) 62995632.78 * 2.83%* Hmean tput-18 86697139.39 ( 0.00%) 86461545.74 ( -0.27%) Hmean tput-24 104854637.04 ( 0.00%) 104522649.46 * -0.32%* In the case of 6 threads and 8 threads, we see the greatest performance improvement. 
Similar improvement can be seen on lkp-pixz though the improvement is smaller: * lkp-pixz benchmark 2-24 threads lkp-pixz (on 1NUMA * 24cores = 24cores) lkp-pixz lkp-pixz w/o patch w/ patch Hmean tput-2 6486981.16 ( 0.00%) 6561515.98 * 1.15%* Hmean tput-4 11645766.38 ( 0.00%) 11614628.43 ( -0.27%) Hmean tput-6 15429943.96 ( 0.00%) 15957350.76 * 3.42%* Hmean tput-8 19974087.63 ( 0.00%) 20413746.98 * 2.20%* Hmean tput-12 28172068.18 ( 0.00%) 28751997.06 * 2.06%* Hmean tput-18 39413409.54 ( 0.00%) 39896830.55 * 1.23%* Hmean tput-24 49101815.85 ( 0.00%) 49418141.47 * 0.64%* * SPECrate benchmark 4,8,16 copies mcf_r(on 1NUMA * 32cores = 32cores) Base Base Run Time Rate ------- --------- 4 Copies w/o 580 (w/ 570) w/o 11.1 (w/ 11.3) 8 Copies w/o 647 (w/ 605) w/o 20.0 (w/ 21.4, +7%) 16 Copies w/o 844 (w/ 844) w/o 30.6 (w/ 30.6) 32 Copies(on 4NUMA * 32 cores = 128cores) [w/o patch] Base Base Base Benchmarks Copies Run Time Rate --------------- ------- --------- --------- 500.perlbench_r 32 584 87.2 * 502.gcc_r 32 503 90.2 * 505.mcf_r 32 745 69.4 * 520.omnetpp_r 32 1031 40.7 * 523.xalancbmk_r 32 597 56.6 * 525.x264_r 1 -- CE 531.deepsjeng_r 32 336 109 * 541.leela_r 32 556 95.4 * 548.exchange2_r 32 513 163 * 557.xz_r 32 530 65.2 * Est. SPECrate2017_int_base 80.3 [w/ patch] Base Base Base Benchmarks Copies Run Time Rate --------------- ------- --------- --------- 500.perlbench_r 32 580 87.8 (+0.688%) * 502.gcc_r 32 477 95.1 (+5.432%) * 505.mcf_r 32 644 80.3 (+13.574%) * 520.omnetpp_r 32 942 44.6 (+9.58%) * 523.xalancbmk_r 32 560 60.4 (+6.714%%) * 525.x264_r 1 -- CE 531.deepsjeng_r 32 337 109 (+0.000%) * 541.leela_r 32 554 95.6 (+0.210%) * 548.exchange2_r 32 515 163 (+0.000%) * 557.xz_r 32 524 66.0 (+1.227%) * Est. SPECrate2017_int_base 83.7 (+4.062%) On the other hand, it is slightly helpful to CPU-bound tasks like kernbench: * 24-96 threads kernbench (on 4NUMA * 24cores = 96cores) kernbench kernbench w/o cluster w/ cluster Min user-24 12054.67 ( 0.00%) 12024.19 ( 0.25%) Min syst-24 1751.51 ( 0.00%) 1731.68 ( 1.13%) Min elsp-24 600.46 ( 0.00%) 598.64 ( 0.30%) Min user-48 12361.93 ( 0.00%) 12315.32 ( 0.38%) Min syst-48 1917.66 ( 0.00%) 1892.73 ( 1.30%) Min elsp-48 333.96 ( 0.00%) 332.57 ( 0.42%) Min user-96 12922.40 ( 0.00%) 12921.17 ( 0.01%) Min syst-96 2143.94 ( 0.00%) 2110.39 ( 1.56%) Min elsp-96 211.22 ( 0.00%) 210.47 ( 0.36%) Amean user-24 12063.99 ( 0.00%) 12030.78 * 0.28%* Amean syst-24 1755.20 ( 0.00%) 1735.53 * 1.12%* Amean elsp-24 601.60 ( 0.00%) 600.19 ( 0.23%) Amean user-48 12362.62 ( 0.00%) 12315.56 * 0.38%* Amean syst-48 1921.59 ( 0.00%) 1894.95 * 1.39%* Amean elsp-48 334.10 ( 0.00%) 332.82 * 0.38%* Amean user-96 12925.27 ( 0.00%) 12922.63 ( 0.02%) Amean syst-96 2146.66 ( 0.00%) 2122.20 * 1.14%* Amean elsp-96 211.96 ( 0.00%) 211.79 ( 0.08%) Note this patch isn't an universal win, it might hurt those workload which can benefit from packing. Though tasks which want to take advantages of lower communication latency of one cluster won't necessarily been packed in one cluster while kernel is not aware of clusters, they have some chance to be randomly packed. But this patch will make them more likely spread. 
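For illustration, a minimal sketch (not part of this patch) to confirm the new level on a running kernel: the level added here is named CLS via SD_INIT_NAME(CLS), and with CONFIG_SCHED_DEBUG the per-CPU domain names are exported under debugfs. The path below assumes the /sys/kernel/debug/sched/domains/ layout used by recent kernels and a mounted debugfs:

  /* show_domains.c - illustrative only, not part of this patch. */
  #include <stdio.h>

  int main(void)
  {
          char path[128], name[64];
          int level;
          FILE *f;

          for (level = 0; ; level++) {
                  snprintf(path, sizeof(path),
                           "/sys/kernel/debug/sched/domains/cpu0/domain%d/name",
                           level);
                  f = fopen(path, "r");
                  if (!f)
                          break;
                  /* prints one name per level, e.g. SMT, CLS, MC, NUMA */
                  if (fgets(name, sizeof(name), f))
                          printf("domain%d: %s", level, name);
                  fclose(f);
          }
          return 0;
  }
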
Signed-off-by: Barry Song Tested-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) --- arch/arm64/Kconfig | 9 +++++++++ include/linux/sched/topology.h | 7 +++++++ include/linux/topology.h | 7 +++++++ kernel/sched/topology.c | 5 +++++ 4 files changed, 28 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c7ae4c3954b..d13677f4731d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -989,6 +989,15 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_CLUSTER + bool "Cluster scheduler support" + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + config SCHED_SMT bool "SMT scheduler support" help diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 8f0f778b7c91..2f9166f6dec8 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -42,6 +42,13 @@ static inline int cpu_smt_flags(void) } #endif +#ifdef CONFIG_SCHED_CLUSTER +static inline int cpu_cluster_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + #ifdef CONFIG_SCHED_MC static inline int cpu_core_flags(void) { diff --git a/include/linux/topology.h b/include/linux/topology.h index 80d27d717631..0b3704ad13c8 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -212,6 +212,13 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif +#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) +static inline const struct cpumask *cpu_cluster_mask(int cpu) +{ + return topology_cluster_cpumask(cpu); +} +#endif + static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5af3edd34d6d..c1729f9a715f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1638,6 +1638,11 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif + +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, +#endif + #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif -- cgit From 66558b730f2533cc2bf2b74d51f5f80b81e2bad0 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 24 Sep 2021 20:51:04 +1200 Subject: sched: Add cluster scheduler level for x86 There are x86 CPU architectures (e.g. Jacobsville) where L2 cahce is shared among a cluster of cores instead of being exclusive to one single core. To prevent oversubscription of L2 cache, load should be balanced between such L2 clusters, especially for tasks with no shared data. On benchmark such as SPECrate mcf test, this change provides a boost to performance especially on medium load system on Jacobsville. on a Jacobsville that has 24 Atom cores, arranged into 6 clusters of 4 cores each, the benchmark number is as follow: Improvement over baseline kernel for mcf_r copies run time base rate 1 -0.1% -0.2% 6 25.1% 25.1% 12 18.8% 19.0% 24 0.3% 0.3% So this looks pretty good. In terms of the system's task distribution, some pretty bad clumping can be seen for the vanilla kernel without the L2 cluster domain for the 6 and 12 copies case. 
With the extra domain for cluster, the load does get evened out between the clusters. Note this patch isn't a universal win, as spreading isn't necessarily a win, particularly for workloads that can benefit from packing. Signed-off-by: Tim Chen Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210924085104.44806-4-21cnbao@gmail.com --- arch/x86/Kconfig | 11 +++++++++++ arch/x86/include/asm/smp.h | 7 +++++++ arch/x86/include/asm/topology.h | 3 +++ arch/x86/kernel/cpu/cacheinfo.c | 1 + arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/smpboot.c | 44 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 68 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab83c22d274e..349e59b2f0e3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1001,6 +1001,17 @@ config NR_CPUS This is purely to save memory: each supported CPU adds about 8KB to the kernel image. +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on SMP + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + config SCHED_SMT def_bool y if SMP diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 630ff08532be..08b0e90623ad 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); +DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); static inline struct cpumask *cpu_llc_shared_mask(int cpu) @@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) return per_cpu(cpu_llc_shared_map, cpu); } +static inline struct cpumask *cpu_l2c_shared_mask(int cpu) +{ + return per_cpu(cpu_l2c_shared_map, cpu); +} + DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); diff --git a/include/linux/topology.h b/include/linux/topology.h index 80d27d717631..0b3704ad13c8 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { } #include extern const struct cpumask *cpu_coregroup_mask(int cpu); +extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) @@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_die_per_package; #ifdef CONFIG_SMP +#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) +#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 
b5e36bd0425b..fe98a1465be6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) l2 = new_l2; #ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; + per_cpu(cpu_l2c_id, cpu) = l2_id; #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f8885949e8c..1c7897c33327 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu) } EXPORT_SYMBOL_GPL(get_llc_id); +/* L2 cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85f6e242b6b4..5094ab0bae58 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + /* Do not match if we do not have a valid APICID for cpu: */ + if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + return false; + + /* Do not match if L2 cache id does not match: */ + if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + return false; + + return topology_sane(c, o, "l2c"); +} + /* * Unlike the other levels, we do not enforce keeping a * multicore group inside a NUMA node. If this happens, we will @@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? 
SD_ASYM_PACKING : 0; @@ -541,12 +558,21 @@ static int x86_smt_flags(void) return cpu_smt_flags() | x86_sched_itmt_flags(); } #endif +#ifdef CONFIG_SCHED_CLUSTER +static int x86_cluster_flags(void) +{ + return cpu_cluster_flags() | x86_sched_itmt_flags(); +} +#endif #endif static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -557,6 +583,9 @@ static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -584,6 +613,7 @@ void set_cpu_sibling_map(int cpu) if (!has_mp) { cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; @@ -602,6 +632,9 @@ void set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_l2c(c, o))) + link_mask(cpu_l2c_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) link_mask(topology_die_cpumask, cpu, i); } @@ -652,6 +685,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return cpu_llc_shared_mask(cpu); } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return cpu_l2c_shared_mask(cpu); +} + static void impress_friends(void) { int cpu; @@ -1335,6 +1373,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } /* @@ -1564,7 +1603,10 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(cpu_l2c_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); -- cgit From da6ff09943491819e077b94c284bf0a6b751c9b8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:49 +0200 Subject: sched/rt: Annotate the RT balancing logic irqwork as IRQ_WORK_HARD_IRQ The push-IPI logic for RT tasks expects to be invoked from hardirq context. One reason is that a RT task on the remote CPU would block the softirq processing on PREEMPT_RT and so avoid pulling / balancing the RT tasks as intended. Annotate root_domain::rto_push_work as IRQ_WORK_HARD_IRQ. 
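For reference only (not part of this patch), the annotation boils down to how the work item is initialized; a minimal sketch with a hypothetical callback:

    /* Hypothetical callback; the names are illustrative only. */
    static void my_push_func(struct irq_work *work)
    {
            /* Runs from the hardirq handler, must not sleep. */
    }

    /* Regular item: may be deferred on PREEMPT_RT (see later patches). */
    static struct irq_work my_work = IRQ_WORK_INIT(my_push_func);

    /* Hard item: always executed from hardirq context, like rto_push_work here. */
    static struct irq_work my_hard_work = IRQ_WORK_INIT_HARD(my_push_func);

Queueing via irq_work_queue() / irq_work_queue_on() is unchanged in both cases; only the context the callback runs in differs.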
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-2-bigeasy@linutronix.de --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c1729f9a715f..e81246787560 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd) #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif rd->visit_gen = 0; -- cgit From 810979682ccc98dbd83f341c18a2e556c30a7164 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:50 +0200 Subject: irq_work: Allow irq_work_sync() to sleep if irq_work() no IRQ support. irq_work() triggers instantly an interrupt if supported by the architecture. Otherwise the work will be processed on the next timer tick. In worst case irq_work_sync() could spin up to a jiffy. irq_work_sync() is usually used in tear down context which is fully preemptible. Based on review irq_work_sync() is invoked from preemptible context and there is one waiter at a time. This qualifies it to use rcuwait for synchronisation. Let irq_work_sync() synchronize with rcuwait if the architecture processes irqwork via the timer tick. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-3-bigeasy@linutronix.de --- include/linux/irq_work.h | 3 +++ kernel/irq_work.c | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index ec2a47a81e42..b48955e9c920 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -3,6 +3,7 @@ #define _LINUX_IRQ_WORK_H #include +#include /* * An entry can be in one of four states: @@ -16,11 +17,13 @@ struct irq_work { struct __call_single_node node; void (*func)(struct irq_work *); + struct rcuwait irqwait; }; #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ } #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index db8c248ebc8c..e789beda8297 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -160,6 +160,9 @@ void irq_work_single(void *arg) * else claimed it meanwhile. */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); + + if (!arch_irq_work_has_interrupt()) + rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) @@ -204,6 +207,13 @@ void irq_work_tick(void) void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); + might_sleep(); + + if (!arch_irq_work_has_interrupt()) { + rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), + TASK_UNINTERRUPTIBLE); + return; + } while (irq_work_is_busy(work)) cpu_relax(); -- cgit From b4c6f86ec2f648b5e6d4b04564fbc6d5351160a8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Oct 2021 11:26:46 +0200 Subject: irq_work: Handle some irq_work in a per-CPU thread on PREEMPT_RT The irq_work callback is invoked in hard IRQ context. 
By default all callbacks are scheduled for invocation right away (given supported by the architecture) except for the ones marked IRQ_WORK_LAZY which are delayed until the next timer-tick. While looking over the callbacks, some of them may acquire locks (spinlock_t, rwlock_t) which are transformed into sleeping locks on PREEMPT_RT and must not be acquired in hard IRQ context. Changing the locks into locks which could be acquired in this context will lead to other problems such as increased latencies if everything in the chain has IRQ-off locks. This will not solve all the issues as one callback has been noticed which invoked kref_put() and its callback invokes kfree() and this can not be invoked in hardirq context. Some callbacks are required to be invoked in hardirq context even on PREEMPT_RT to work properly. This includes for instance the NO_HZ callback which needs to be able to observe the idle context. The callbacks which require to be run in hardirq have already been marked. Use this information to split the callbacks onto the two lists on PREEMPT_RT: - lazy_list Work items which are not marked with IRQ_WORK_HARD_IRQ will be added to this list. Callbacks on this list will be invoked from a per-CPU thread. The handler here may acquire sleeping locks such as spinlock_t and invoke kfree(). - raised_list Work items which are marked with IRQ_WORK_HARD_IRQ will be added to this list. They will be invoked in hardirq context and must not acquire any sleeping locks. The wake up of the per-CPU thread occurs from irq_work handler/ hardirq context. The thread runs with lowest RT priority to ensure it runs before any SCHED_OTHER tasks do. [bigeasy: melt tglx's irq_work_tick_soft() which splits irq_work_tick() into a hard and soft variant. Collected fixes over time from Steven Rostedt and Mike Galbraith. Move to per-CPU threads instead of softirq as suggested by PeterZ.] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211007092646.uhshe3ut2wkrcfzv@linutronix.de --- kernel/irq_work.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 12 deletions(-) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e789beda8297..90b6b56f92e9 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -18,11 +18,36 @@ #include #include #include +#include #include #include static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); +static DEFINE_PER_CPU(struct task_struct *, irq_workd); + +static void wake_irq_workd(void) +{ + struct task_struct *tsk = __this_cpu_read(irq_workd); + + if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) + wake_up_process(tsk); +} + +#ifdef CONFIG_SMP +static void irq_work_wake(struct irq_work *entry) +{ + wake_irq_workd(); +} + +static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = + IRQ_WORK_INIT_HARD(irq_work_wake); +#endif + +static int irq_workd_should_run(unsigned int cpu) +{ + return !llist_empty(this_cpu_ptr(&lazy_list)); +} /* * Claim the entry so that no one else will poke at it. 
@@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void) /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { + struct llist_head *list; + bool rt_lazy_work = false; + bool lazy_work = false; + int work_flags; + + work_flags = atomic_read(&work->node.a_flags); + if (work_flags & IRQ_WORK_LAZY) + lazy_work = true; + else if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(work_flags & IRQ_WORK_HARD_IRQ)) + rt_lazy_work = true; + + if (lazy_work || rt_lazy_work) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + + if (!llist_add(&work->node.llist, list)) + return; + /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); - } else { - if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); - } + if (!lazy_work || tick_nohz_tick_stopped()) + arch_irq_work_raise(); } /* Enqueue the irq work @work on the current CPU */ @@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); + + /* + * On PREEMPT_RT the items which are not marked as + * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work + * item is used on the remote CPU to wake the thread. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { + + if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) + goto out; + + work = &per_cpu(irq_work_wakeup, cpu); + if (!irq_work_claim(work)) + goto out; + } + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } +out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } - bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; @@ -170,7 +226,12 @@ static void irq_work_run_list(struct llist_head *list) struct irq_work *work, *tmp; struct llist_node *llnode; - BUG_ON(!irqs_disabled()); + /* + * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed + * in a per-CPU thread in preemptible context. Only the items which are + * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. 
+ */ + BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; @@ -187,7 +248,10 @@ static void irq_work_run_list(struct llist_head *list) void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); @@ -197,7 +261,11 @@ void irq_work_tick(void) if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } /* @@ -219,3 +287,29 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +static void run_irq_workd(unsigned int cpu) +{ + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + +static void irq_workd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static struct smp_hotplug_thread irqwork_threads = { + .store = &irq_workd, + .setup = irq_workd_setup, + .thread_should_run = irq_workd_should_run, + .thread_fn = run_irq_workd, + .thread_comm = "irq_work/%u", +}; + +static __init int irq_work_init_threads(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); + return 0; +} +early_initcall(irq_work_init_threads); -- cgit From 09089db79859cbccccd8df95b034f36f7027efa6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 6 Oct 2021 13:18:52 +0200 Subject: irq_work: Also rcuwait for !IRQ_WORK_HARD_IRQ on PREEMPT_RT On PREEMPT_RT most items are processed as LAZY via softirq context. Avoid to spin-wait for them because irq_work_sync() could have higher priority and not allow the irq-work to be completed. Wait additionally for !IRQ_WORK_HARD_IRQ irq_work items on PREEMPT_RT. 
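For context (an illustrative sketch, not part of this patch): the rcuwait pairing relied on here is the generic publish-then-wake pattern, where the waiter sleeps in fully preemptible context:

    static struct rcuwait w;        /* rcuwait_init(&w) at setup time */
    static bool done;

    static void waiter(void)        /* preemptible, e.g. irq_work_sync() */
    {
            rcuwait_wait_event(&w, READ_ONCE(done), TASK_UNINTERRUPTIBLE);
    }

    static void completer(void)     /* e.g. irq_work_single() after clearing BUSY */
    {
            WRITE_ONCE(done, true);
            rcuwait_wake_up(&w);
    }

With this patch, PREEMPT_RT takes the rcuwait path for all non-hard items as well, instead of spin-waiting on irq_work_is_busy().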
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211006111852.1514359-5-bigeasy@linutronix.de --- include/linux/irq_work.h | 5 +++++ kernel/irq_work.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index b48955e9c920..8cd11a223260 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -49,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; } +static inline bool irq_work_is_hard(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 90b6b56f92e9..f7df715ec28e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -217,7 +217,8 @@ void irq_work_single(void *arg) */ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); - if (!arch_irq_work_has_interrupt()) + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } @@ -277,7 +278,8 @@ void irq_work_sync(struct irq_work *work) lockdep_assert_irqs_enabled(); might_sleep(); - if (!arch_irq_work_has_interrupt()) { + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; -- cgit From 96611c26dc351c33f73b48756a9feacc109e5bab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Oct 2021 16:41:05 +0200 Subject: sched: Improve wake_up_all_idle_cpus() take #2 As reported by syzbot and experienced by Pavel, using cpus_read_lock() in wake_up_all_idle_cpus() generates lock inversion (against mmap_sem and possibly others). Instead, shrink the preempt disable region by iterating all CPUs and checking the online status for each individual CPU while having preemption disabled. Fixes: 8850cb663b5c ("sched: Simplify wake_up_*idle*()") Reported-by: syzbot+d5b23b18d2f4feae8a67@syzkaller.appspotmail.com Reported-by: Pavel Machek Reported-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qian Cai --- kernel/smp.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index ad0b68a3a3d3..01a7c1706a58 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1170,14 +1170,12 @@ void wake_up_all_idle_cpus(void) { int cpu; - cpus_read_lock(); - for_each_online_cpu(cpu) { - if (cpu == raw_smp_processor_id()) - continue; - - wake_up_if_idle(cpu); + for_each_possible_cpu(cpu) { + preempt_disable(); + if (cpu != smp_processor_id() && cpu_online(cpu)) + wake_up_if_idle(cpu); + preempt_enable(); } - cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); -- cgit From eaed27d0d01a89a510736d87f10cea02042b4756 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 19 Oct 2021 10:58:39 +0800 Subject: sched/core: Remove rq_relock() After the removal of migrate_tasks(), there is no user of rq_relock() left, so remove it. 
Signed-off-by: Peng Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/449948fdf9be4764b3929c52572917dd25eef758.1634611953.git.rocking@linux.alibaba.com --- kernel/sched/sched.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a00fc7057d97..f0b249ec581d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1615,14 +1615,6 @@ rq_lock(struct rq *rq, struct rq_flags *rf) rq_pin_lock(rq, rf); } -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_rq_lock(rq); - rq_repin_lock(rq, rf); -} - static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) -- cgit From 55409ac5c371c6403012d5f4df5e7c6cf0e7dce6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Oct 2021 17:49:53 +0200 Subject: sched,x86: Fix L2 cache mask Currently AMD/Hygon do not populate l2c_id, this means that for SMT enabled systems they report an L2 per thread. This is ofcourse not true but was harmless so far. However, since commit: 66558b730f25 ("sched: Add cluster scheduler level for x86") the scheduler topology setup requires: SMT <= L2 <= LLC Which leads to noisy warnings and possibly weird behaviour on affected chips. Therefore change the topology generation such that if l2c_id is not populated it follows the SMT topology, thereby satisfying the constraint. Fixes: 66558b730f25 ("sched: Add cluster scheduler level for x86") Reported-by: Tom Lendacky Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tom Lendacky --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5094ab0bae58..f80f4595ed41 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -470,9 +470,9 @@ static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - /* Do not match if we do not have a valid APICID for cpu: */ + /* If the arch didn't set up l2c_id, fall back to SMT */ if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) - return false; + return match_smt(c, o); /* Do not match if L2 cache id does not match: */ if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) -- cgit From 5d1ceb3969b6b2e47e2df6d17790a7c5a20fcbb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Oct 2021 16:53:02 +0200 Subject: x86: Fix __get_wchan() for !STACKTRACE Use asm/unwind.h to implement wchan, since we cannot always rely on STACKTRACE=y. 
Fixes: bc9bbb81730e ("x86: Fix get_wchan() to support the ORC unwinder") Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/20211022152104.137058575@infradead.org --- arch/x86/kernel/process.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a69885a9d711..8fb6bd4cbc50 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -43,6 +43,7 @@ #include #include #include +#include #include "process.h" @@ -944,10 +945,20 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) */ unsigned long __get_wchan(struct task_struct *p) { - unsigned long entry = 0; + struct unwind_state state; + unsigned long addr = 0; - stack_trace_save_tsk(p, &entry, 1, 0); - return entry; + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (in_sched_functions(addr)) + continue; + break; + } + + return addr; } long do_arch_prctl_common(struct task_struct *task, int option, -- cgit From 9e9af819db5dbe4bf99101628955a26e2a41a1a5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:33 +0200 Subject: sched/fair: Account update_blocked_averages in newidle_balance cost The time spent to update the blocked load can be significant depending of the complexity fo the cgroup hierarchy. Take this time into account in the cost of the 1st load balance of a newly idle cpu. Also reduce the number of call to sched_clock_cpu() and track more actual work. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 87db481e8a56..c0145677ee99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10840,9 +10840,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; - u64 curr_cost = 0; update_misfit_status(NULL, this_rq); @@ -10887,11 +10887,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) raw_spin_rq_unlock(this_rq); + t0 = sched_clock_cpu(this_cpu); update_blocked_averages(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { int continue_balancing = 1; - u64 t0, domain_cost; + u64 domain_cost; if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); @@ -10899,17 +10901,18 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) } if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - domain_cost = sched_clock_cpu(this_cpu) - t0; + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; if (domain_cost > sd->max_newidle_lb_cost) sd->max_newidle_lb_cost = domain_cost; curr_cost += domain_cost; + t0 = t1; } update_next_balance(sd, &next_balance); -- cgit From 9d783c8dd112ad4b619e74e4bf57d2be0b400693 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:34 +0200 Subject: sched/fair: Skip update_blocked_averages if we are defering load balance In 
newidle_balance(), the scheduler skips load balance to the new idle cpu when the 1st sd of this_rq is: this_rq->avg_idle < sd->max_newidle_lb_cost Doing a costly call to update_blocked_averages() will not be useful and simply adds overhead when this condition is true. Check the condition early in newidle_balance() to skip update_blocked_averages() when possible. Signed-off-by: Vincent Guittot Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c0145677ee99..c4c36865321b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10873,17 +10873,20 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload)) { + !READ_ONCE(this_rq->rd->overload) || + (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; } + rcu_read_unlock(); raw_spin_rq_unlock(this_rq); -- cgit From e60b56e46b384cee1ad34e6adc164d883049c6c3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:35 +0200 Subject: sched/fair: Wait before decaying max_newidle_lb_cost Decay max_newidle_lb_cost only when it has not been updated for a while and ensure to not decay a recently changed value. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-4-vincent.guittot@linaro.org --- include/linux/sched/topology.h | 2 +- kernel/sched/fair.c | 36 +++++++++++++++++++++++++++--------- kernel/sched/topology.c | 2 +- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 2f9166f6dec8..c07bfa2d80f2 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -105,7 +105,7 @@ struct sched_domain { /* idle_balance() stats */ u64 max_newidle_lb_cost; - unsigned long next_decay_max_lb_cost; + unsigned long last_decay_max_lb_cost; u64 avg_scan_cost; /* select_idle_sibling */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4c36865321b..e50fd751e1df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10239,6 +10239,30 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +{ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = jiffies; + } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. 
+ */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; + sd->last_decay_max_lb_cost = jiffies; + + return true; + } + + return false; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -10262,14 +10286,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. + * visit to all the domains. */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } + need_decay = update_newidle_cost(sd, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -10911,8 +10930,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; + update_newidle_cost(sd, domain_cost); curr_cost += domain_cost; t0 = t1; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index e81246787560..30169c7685b6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1568,7 +1568,7 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, + .last_decay_max_lb_cost = jiffies, .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, -- cgit From c5b0a7eefc70150caf23e37bc9d639c68c87a097 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:36 +0200 Subject: sched/fair: Remove sysctl_sched_migration_cost condition With a default value of 500us, sysctl_sched_migration_cost is significanlty higher than the cost of load_balance. Remove the condition and rely on the sd->max_newidle_lb_cost to abort newidle_balance. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-5-vincent.guittot@linaro.org --- kernel/sched/fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e50fd751e1df..57eae0ebc492 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10895,8 +10895,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload) || + if (!READ_ONCE(this_rq->rd->overload) || (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { if (sd) -- cgit From 8ea9183db4ad8afbcb7089a77c23eaf965b0cacd Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 19 Oct 2021 14:35:37 +0200 Subject: sched/fair: Cleanup newidle_balance update_next_balance() uses sd->last_balance which is not modified by load_balance() so we can merge the 2 calls in one place. 
No functional change Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20211019123537.17146-6-vincent.guittot@linaro.org --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57eae0ebc492..13950beb01a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10916,10 +10916,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) int continue_balancing = 1; u64 domain_cost; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) break; - } if (sd->flags & SD_BALANCE_NEWIDLE) { @@ -10935,8 +10935,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) t0 = t1; } - update_next_balance(sd, &next_balance); - /* * Stop searching for tasks to pull if there are * now runnable tasks on this rq. -- cgit
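As a rough sanity check on the "~1% per second" decay used by update_newidle_cost() above (the half-life figure is derived here, not stated in the patches): each decay step multiplies max_newidle_lb_cost by 253/256, roughly 0.988, i.e. about 1.2% per step. With at most one step per second, a stale cost halves in about ln(2)/ln(256/253), roughly 59 seconds, while any recent update pushes the next decay out by another HZ.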