From dbfb089d360b1cc623c51a2c7cf9b99eff78e0e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jul 2020 12:40:33 +0200 Subject: sched: Fix loadavg accounting race The recent commit: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") moved these lines in ttwu(): p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; up before: smp_cond_load_acquire(&p->on_cpu, !VAL); into the 'p->on_rq == 0' block, with the thinking that once we hit schedule() the current task cannot change it's ->state anymore. And while this is true, it is both incorrect and flawed. It is incorrect in that we need at least an ACQUIRE on 'p->on_rq == 0' to avoid weak hardware from re-ordering things for us. This can fairly easily be achieved by relying on the control-dependency already in place. The second problem, which makes the flaw in the original argument, is that while schedule() will not change prev->state, it will read it a number of times (arguably too many times since it's marked volatile). The previous condition 'p->on_cpu == 0' was sufficient because that indicates schedule() has completed, and will no longer read prev->state. So now the trick is to make this same true for the (much) earlier 'prev->on_rq == 0' case. Furthermore, in order to make the ordering stick, the 'prev->on_rq = 0' assignment needs to he a RELEASE, but adding additional ordering to schedule() is an unwelcome proposition at the best of times, doubly so for mere accounting. Luckily we can push the prev->state load up before rq->lock, with the only caveat that we then have to re-read the state after. However, we know that if it changed, we no longer have to worry about the blocking path. This gives us the required ordering, if we block, we did the prev->state load before an (effective) smp_mb() and the p->on_rq store needs not change. With this we end up with the effective ordering: LOAD p->state LOAD-ACQUIRE p->on_rq == 0 MB STORE p->on_rq, 0 STORE p->state, TASK_WAKING which ensures the TASK_WAKING store happens after the prev->state load, and all is well again. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Dave Jones Reported-by: Paul Gortmaker Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dave Jones Tested-by: Paul Gortmaker Link: https://lkml.kernel.org/r/20200707102957.GN117543@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 67 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca5db40392d4..950ac45d5480 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -4097,6 +4100,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4113,12 +4117,22 @@ static void __sched notrace __schedule(bool preempt) local_irq_disable(); rcu_note_context_switch(preempt); + /* See deactivate_task() below. */ + prev_state = prev->state; + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): + * + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) * - * The membarrier system call requires a full memory barrier + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4129,10 +4143,31 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + /* + * We must re-load prev->state in case ttwu_remote() changed it + * before we acquired rq->lock. + */ + if (!preempt && prev_state && prev_state == prev->state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (READ_ONCE(p->on_rq) && ...) + * LOCK rq->lock goto out; + * smp_mb__after_spinlock(); smp_acquire__after_ctrl_dep(); + * p->on_rq = 0; p->state = TASK_WAKING; + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { -- cgit From ce3614daabea8a2d01c1dd17ae41d1ec5e5ae7db Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 6 Jul 2020 16:49:10 -0400 Subject: sched: Fix unreliable rseq cpu_id for new tasks While integrating rseq into glibc and replacing glibc's sched_getcpu implementation with rseq, glibc's tests discovered an issue with incorrect __rseq_abi.cpu_id field value right after the first time a newly created process issues sched_setaffinity. For the records, it triggers after building glibc and running tests, and then issuing: for x in {1..2000} ; do posix/tst-affinity-static & done and shows up as: error: Unexpected CPU 2, expected 0 error: Unexpected CPU 2, expected 0 error: Unexpected CPU 2, expected 0 error: Unexpected CPU 2, expected 0 error: Unexpected CPU 138, expected 0 error: Unexpected CPU 138, expected 0 error: Unexpected CPU 138, expected 0 error: Unexpected CPU 138, expected 0 This is caused by the scheduler invoking __set_task_cpu() directly from sched_fork() and wake_up_new_task(), thus bypassing rseq_migrate() which is done by set_task_cpu(). Add the missing rseq_migrate() to both functions. The only other direct use of __set_task_cpu() is done by init_idle(), which does not involve a user-space task. Based on my testing with the glibc test-case, just adding rseq_migrate() to wake_up_new_task() is sufficient to fix the observed issue. Also add it to sched_fork() to keep things consistent. The reason why this never triggered so far with the rseq/basic_test selftest is unclear. The current use of sched_getcpu(3) does not typically require it to be always accurate. However, use of the __rseq_abi.cpu_id field within rseq critical sections requires it to be accurate. If it is not accurate, it can cause corruption in the per-cpu data targeted by rseq critical sections in user-space. Reported-By: Florian Weimer Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Tested-By: Florian Weimer Cc: stable@vger.kernel.org # v4.18+ Link: https://lkml.kernel.org/r/20200707201505.2632-1-mathieu.desnoyers@efficios.com --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 950ac45d5480..e15543cb8481 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2965,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -3029,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); -- cgit From 01cfcde9c26d8555f0e6e9aea9d6049f87683998 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 10 Jul 2020 17:24:26 +0200 Subject: sched/fair: handle case of task_h_load() returning 0 task_h_load() can return 0 in some situations like running stress-ng mmapfork, which forks thousands of threads, in a sched group on a 224 cores system. The load balance doesn't handle this correctly because env->imbalance never decreases and it will stop pulling tasks only after reaching loop_max, which can be equal to the number of running tasks of the cfs. Make sure that imbalance will be decreased by at least 1. misfit task is the other feature that doesn't handle correctly such situation although it's probably more difficult to face the problem because of the smaller number of CPUs and running tasks on heterogenous system. We can't simply ensure that task_h_load() returns at least one because it would imply to handle underflow in other places. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Cc: # v4.4+ Link: https://lkml.kernel.org/r/20200710152426.16981-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 658aa7a2ae6f..04fa8dbcfa4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) -- cgit