From 3ccfebedd8cf54e291c809c838d8ad5cc00f5688 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:11 -0500 Subject: powerpc, membarrier: Skip memory barrier in switch_mm() Allow PowerPC to skip the full memory barrier in switch_mm(), and only issue the barrier when scheduling into a task belonging to a process that has registered to use expedited private. Threads targeting the same VM but which belong to different thread groups is a tricky case. It has a few consequences: It turns out that we cannot rely on get_nr_threads(p) to count the number of threads using a VM. We can use (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) instead to skip the synchronize_sched() for cases where the VM only has a single user, and that user only has a single thread. It also turns out that we cannot use for_each_thread() to set thread flags in all threads using a VM, as it only iterates on the thread group. Therefore, test the membarrier state variable directly rather than relying on thread flags. This means membarrier_register_private_expedited() needs to set the MEMBARRIER_STATE_PRIVATE_EXPEDITED flag, issue synchronize_sched(), and only then set MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY which allows private expedited membarrier commands to succeed. membarrier_arch_switch_mm() now tests for the MEMBARRIER_STATE_PRIVATE_EXPEDITED flag. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Alan Stern Cc: Alexander Viro Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20180129202020.8515-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ---------- kernel/sched/membarrier.c | 8 ++++++++ 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3da7a2444a91..ead0c2135d47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2698,16 +2698,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); - /* - * The membarrier system call requires a full memory barrier - * after storing to rq->curr, before going back to user-space. - * - * TODO: This smp_mb__after_unlock_lock can go away if PPC end - * up adding a full barrier to switch_mm(), or we should figure - * out if a smp_mb__after_unlock_lock is really the proper API - * to use. - */ - smp_mb__after_unlock_lock(); finish_task(prev); finish_lock_switch(rq); finish_arch_post_lock_switch(); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 9bcbacba82a8..678577267a9a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -118,6 +118,14 @@ static void membarrier_register_private_expedited(void) if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) return; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); + if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + /* + * Ensure all future scheduler executions will observe the + * new thread flag state for this process. 
+ */ + synchronize_sched(); + } atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, &mm->membarrier_state); } -- cgit From 306e060435d7a3aef8f6f033e43b0f581638adce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:12 -0500 Subject: membarrier: Document scheduler barrier requirements Document the membarrier requirement on having a full memory barrier in __schedule() after coming from user-space, before storing to rq->curr. It is provided by smp_mb__after_spinlock() in __schedule(). Document that membarrier requires a full barrier on transition from kernel thread to userspace thread. We currently have an implicit barrier from atomic_dec_and_test() in mmdrop() that ensures this. The x86 switch_mm_irqs_off() full barrier is currently provided by many cpumask update operations as well as write_cr3(). Document that write_cr3() provides this barrier. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-4-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ead0c2135d47..11bf4d48d2d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2703,6 +2703,12 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); + /* + * When transitioning from a kernel thread to a userspace + * thread, mmdrop()'s implicit full barrier is required by the + * membarrier system call, because the current ->active_mm can + * become the current mm without going through switch_mm(). + */ if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -2808,6 +2814,13 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); + /* + * If mm is non-NULL, we pass through switch_mm(). If mm is + * NULL, we will pass through mmdrop() in finish_task_switch(). + * Both of these contain the full memory barrier required by + * membarrier after storing to rq->curr, before returning to + * user-space. + */ if (!mm) { next->active_mm = oldmm; mmgrab(oldmm); @@ -3344,6 +3357,9 @@ static void __sched notrace __schedule(bool preempt) * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). + * + * The membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -3391,17 +3407,16 @@ static void __sched notrace __schedule(bool preempt) /* * The membarrier system call requires each architecture * to have a full memory barrier after updating - * rq->curr, before returning to user-space. For TSO - * (e.g. x86), the architecture must provide its own - * barrier in switch_mm(). For weakly ordered machines - * for which spin_unlock() acts as a full memory - * barrier, finish_lock_switch() in common code takes - * care of this barrier. 
For weakly ordered machines for - * which spin_unlock() acts as a RELEASE barrier (only - * arm64 and PowerPC), arm64 has a full barrier in - * switch_to(), and PowerPC has - * smp_mb__after_unlock_lock() before - * finish_lock_switch(). + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), */ ++*switch_count; -- cgit From c5f58bd58f432be5d92df33c5458e0bcbee3aadf Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:13 -0500 Subject: membarrier: Provide GLOBAL_EXPEDITED command Allow expedited membarrier to be used for data shared between processes through shared memory. Processes wishing to receive the membarriers register with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. Those which want to issue membarrier invoke MEMBARRIER_CMD_GLOBAL_EXPEDITED. This allows extremely simple kernel-level implementation: we have almost everything we need with the PRIVATE_EXPEDITED barrier code. All we need to do is to add a flag in the mm_struct that will be used to check whether we need to send the IPI to the current thread of each CPU. There is a slight downside to this approach compared to targeting specific shared memory users: when performing a membarrier operation, all registered "global" receivers will get the barrier, even if they don't share a memory mapping with the sender issuing MEMBARRIER_CMD_GLOBAL_EXPEDITED. This registration approach seems to fit the requirement of not disturbing processes that really deeply care about real-time: they simply should not register with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED. In order to align the membarrier command names, the "MEMBARRIER_CMD_SHARED" command is renamed to "MEMBARRIER_CMD_GLOBAL", keeping an alias of MEMBARRIER_CMD_SHARED to MEMBARRIER_CMD_GLOBAL for UAPI header backward compatibility. Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-5-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/membarrier.c | 120 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 678577267a9a..d2087d5f9837 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -27,7 +27,9 @@ * except MEMBARRIER_CMD_QUERY. */ #define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) static void ipi_mb(void *info) @@ -35,6 +37,73 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. 
*/ } +static int membarrier_global_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return 0; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ + return 0; +} + static int membarrier_private_expedited(void) { int cpu; @@ -105,7 +174,38 @@ static int membarrier_private_expedited(void) return 0; } -static void membarrier_register_private_expedited(void) +static int membarrier_register_global_expedited(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + if (atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) + return 0; + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); + if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + /* + * For single mm user, single threaded process, we can + * simply issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that + * no memory access following registration is reordered + * before registration. + */ + smp_mb(); + } else { + /* + * For multi-mm user threads, we need to ensure all + * future scheduler executions will observe the new + * thread flag state for this mm. 
+ */ + synchronize_sched(); + } + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + &mm->membarrier_state); + return 0; +} + +static int membarrier_register_private_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; @@ -117,7 +217,7 @@ static void membarrier_register_private_expedited(void) */ if (atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) - return; + return 0; atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { /* @@ -128,6 +228,7 @@ static void membarrier_register_private_expedited(void) } atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, &mm->membarrier_state); + return 0; } /** @@ -167,21 +268,24 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) int cmd_mask = MEMBARRIER_CMD_BITMASK; if (tick_nohz_full_enabled()) - cmd_mask &= ~MEMBARRIER_CMD_SHARED; + cmd_mask &= ~MEMBARRIER_CMD_GLOBAL; return cmd_mask; } - case MEMBARRIER_CMD_SHARED: - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + case MEMBARRIER_CMD_GLOBAL: + /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */ if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) synchronize_sched(); return 0; + case MEMBARRIER_CMD_GLOBAL_EXPEDITED: + return membarrier_global_expedited(); + case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: return membarrier_private_expedited(); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: - membarrier_register_private_expedited(); - return 0; + return membarrier_register_private_expedited(); default: return -EINVAL; } -- cgit From 70216e18e519a54a2f13569e8caff99a092a92d6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 29 Jan 2018 15:20:17 -0500 Subject: membarrier: Provide core serializing command, *_SYNC_CORE Provide core serializing membarrier command to support memory reclaim by JIT. Each architecture needs to explicitly opt into that support by documenting in their architecture code how they provide the core serializing instructions required when returning from the membarrier IPI, and after the scheduler has updated the curr->mm pointer (before going back to user-space). They should then select ARCH_HAS_MEMBARRIER_SYNC_CORE to enable support for that command on their architecture. Architectures selecting this feature need to either document that they issue core serializing instructions when returning to user-space, or implement their architecture-specific sync_core_before_usermode(). Signed-off-by: Mathieu Desnoyers Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. 
McKenney Cc: Paul Mackerras Cc: Russell King Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/20180129202020.8515-9-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 +++++++++++----- kernel/sched/membarrier.c | 53 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 11bf4d48d2d3..ee420d78e674 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2704,13 +2704,21 @@ static struct rq *finish_task_switch(struct task_struct *prev) fire_sched_in_preempt_notifiers(current); /* - * When transitioning from a kernel thread to a userspace - * thread, mmdrop()'s implicit full barrier is required by the - * membarrier system call, because the current ->active_mm can - * become the current mm without going through switch_mm(). + * When switching through a kernel thread, the loop in + * membarrier_{private,global}_expedited() may have observed that + * kernel thread and not issued an IPI. It is therefore possible to + * schedule between user->kernel->user threads without passing though + * switch_mm(). Membarrier requires a barrier after storing to + * rq->curr, before returning to userspace, so provide them here: + * + * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly + * provided by mmdrop(), + * - a sync_core for SYNC_CORE. */ - if (mm) + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); + } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index d2087d5f9837..5d0762633639 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -26,11 +26,20 @@ * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. 
*/ +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) static void ipi_mb(void *info) { @@ -104,15 +113,23 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(void) +static int membarrier_private_expedited(int flags) { int cpu; bool fallback = false; cpumask_var_t tmpmask; - if (!(atomic_read(¤t->mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) - return -EPERM; + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + if (!(atomic_read(¤t->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) + return -EPERM; + } else { + if (!(atomic_read(¤t->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + } if (num_online_cpus() == 1) return 0; @@ -205,20 +222,29 @@ static int membarrier_register_global_expedited(void) return 0; } -static int membarrier_register_private_expedited(void) +static int membarrier_register_private_expedited(int flags) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } /* * We need to consider threads belonging to different thread * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). 
*/ - if (atomic_read(&mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) + if (atomic_read(&mm->membarrier_state) & state) return 0; atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); + if (flags & MEMBARRIER_FLAG_SYNC_CORE) + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, + &mm->membarrier_state); if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { /* * Ensure all future scheduler executions will observe the @@ -226,8 +252,7 @@ static int membarrier_register_private_expedited(void) */ synchronize_sched(); } - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, - &mm->membarrier_state); + atomic_or(state, &mm->membarrier_state); return 0; } @@ -283,9 +308,13 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(); + return membarrier_private_expedited(0); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: - return membarrier_register_private_expedited(); + return membarrier_register_private_expedited(0); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); default: return -EINVAL; } -- cgit From b85c8b71bf8de36a88f338f406f02a8cb48c5c3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jan 2018 20:51:06 +0100 Subject: sched/core: Optimize ttwu_stat() The whole of ttwu_stat() is guarded by a single schedstat_enabled(), there is absolutely no point in then issuing another static_branch for every single schedstat_inc() in there. 
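The resulting shape of ttwu_stat() can be sketched roughly as follows (a condensed sketch, not a verbatim excerpt; only a few of the counters are shown):

	/* Evaluate the static branch exactly once ... */
	if (!schedstat_enabled())
		return;

	/* ... then use the unguarded helpers for every update. */
	__schedstat_inc(rq->ttwu_count);
	__schedstat_inc(p->se.statistics.nr_wakeups);
	if (wake_flags & WF_SYNC)
		__schedstat_inc(p->se.statistics.nr_wakeups_sync);
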
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 ++++++++-------- kernel/sched/stats.h | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee420d78e674..b40540e68104 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1630,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { - schedstat_inc(rq->ttwu_local); - schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(rq->ttwu_local); + __schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd->ttwu_wake_remote); + __schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1647,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq->ttwu_count); - schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(rq->ttwu_count); + __schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baf500d12b7c..61500bb97ed8 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -31,6 +31,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) rq->rq_sched_info.run_delay += delta; } #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) @@ -48,6 +49,7 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} #define schedstat_enabled() 0 +#define __schedstat_inc(var) do { } while (0) #define schedstat_inc(var) do { } while (0) #define schedstat_add(var, amt) do { } while (0) #define schedstat_set(var, val) do { } while (0) -- cgit From 2ed41a55023dc5be6742ca0eb8df5cb20e8dcaae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Jan 2018 20:34:30 +0100 Subject: sched/core: Optimize update_stats_*() These functions are already gated by schedstats_enabled(), there is no point in then issuing another static_branch for every individual update in them. 
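The only difference between the guarded and unguarded helpers is the per-call static branch, as the kernel/sched/stats.h hunk below shows; roughly:

	/* CONFIG_SCHEDSTATS=y variants from kernel/sched/stats.h: */
	#define schedstat_add(var, amt)   do { if (schedstat_enabled()) { var += (amt); } } while (0)
	#define __schedstat_add(var, amt) do { var += (amt); } while (0)
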
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 ++++++++++++++++---------------- kernel/sched/stats.h | 4 ++++ 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7b6535987500..a6b8157197bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) likely(wait_start > prev_wait_start)) wait_start -= prev_wait_start; - schedstat_set(se->statistics.wait_start, wait_start); + __schedstat_set(se->statistics.wait_start, wait_start); } static inline void @@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - schedstat_set(se->statistics.wait_start, delta); + __schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - schedstat_set(se->statistics.wait_max, + __schedstat_set(se->statistics.wait_max, max(schedstat_val(se->statistics.wait_max), delta)); - schedstat_inc(se->statistics.wait_count); - schedstat_add(se->statistics.wait_sum, delta); - schedstat_set(se->statistics.wait_start, 0); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - schedstat_set(se->statistics.sleep_max, delta); + __schedstat_set(se->statistics.sleep_max, delta); - schedstat_set(se->statistics.sleep_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.sleep_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); @@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.block_max))) - schedstat_set(se->statistics.block_max, delta); + __schedstat_set(se->statistics.block_max, delta); - schedstat_set(se->statistics.block_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.block_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - schedstat_add(se->statistics.iowait_sum, delta); - schedstat_inc(se->statistics.iowait_count); + __schedstat_add(se->statistics.iowait_sum, delta); + __schedstat_inc(se->statistics.iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - schedstat_set(se->statistics.sleep_start, + __schedstat_set(se->statistics.sleep_start, rq_clock(rq_of(cfs_rq))); if (tsk->state & TASK_UNINTERRUPTIBLE) - schedstat_set(se->statistics.block_start, + __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 61500bb97ed8..8e7b58de61e7 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,7 +33,9 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) #define 
schedstat_enabled() static_branch_unlikely(&sched_schedstats) #define __schedstat_inc(var) do { var++; } while (0) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) @@ -51,7 +53,9 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) #define schedstat_enabled() 0 #define __schedstat_inc(var) do { } while (0) #define schedstat_inc(var) do { } while (0) +#define __schedstat_add(var, amt) do { } while (0) #define schedstat_add(var, amt) do { } while (0) +#define __schedstat_set(var, val) do { } while (0) #define schedstat_set(var, val) do { } while (0) #define schedstat_val(var) 0 #define schedstat_val_or_zero(var) 0 -- cgit From ad0f1d9d65938aec72a698116cd73a980916895e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 23 Jan 2018 20:45:37 -0500 Subject: sched/rt: Use container_of() to get root domain in rto_push_irq_work_func() When the rto_push_irq_work_func() is called, it looks at the RT overloaded bitmask in the root domain via the runqueue (rq->rd). The problem is that during CPU up and down, nothing here stops rq->rd from changing between taking the rq->rd->rto_lock and releasing it. That means the lock that is released is not the same lock that was taken. Instead of using this_rq()->rd to get the root domain, as the irq work is part of the root domain, we can simply get the root domain from the irq work that is passed to the routine: container_of(work, struct root_domain, rto_push_work) This keeps the root domain consistent. Reported-by: Pavan Kondeti Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 4bdced5c9a292 ("sched/rt: Simplify the IPI based RT balancing logic") Link: http://lkml.kernel.org/r/CAEU1=PkiHO35Dzna8EQqNSKW1fr1y1zRQ5y66X117MG06sQtNA@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 862a513adca3..2fb627d062e5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1907,9 +1907,8 @@ static void push_rt_tasks(struct rq *rq) * the rt_loop_next will cause the iterator to perform another scan. * */ -static int rto_next_cpu(struct rq *rq) +static int rto_next_cpu(struct root_domain *rd) { - struct root_domain *rd = rq->rd; int next; int cpu; @@ -1985,7 +1984,7 @@ static void tell_cpu_to_push(struct rq *rq) * Otherwise it is finishing up and an ipi needs to be sent. 
*/ if (rq->rd->rto_cpu < 0) - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rq->rd); raw_spin_unlock(&rq->rd->rto_lock); @@ -1998,6 +1997,8 @@ static void tell_cpu_to_push(struct rq *rq) /* Called from hardirq context */ void rto_push_irq_work_func(struct irq_work *work) { + struct root_domain *rd = + container_of(work, struct root_domain, rto_push_work); struct rq *rq; int cpu; @@ -2013,18 +2014,18 @@ void rto_push_irq_work_func(struct irq_work *work) raw_spin_unlock(&rq->lock); } - raw_spin_lock(&rq->rd->rto_lock); + raw_spin_lock(&rd->rto_lock); /* Pass the IPI to the next rt overloaded queue */ - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rd); - raw_spin_unlock(&rq->rd->rto_lock); + raw_spin_unlock(&rd->rto_lock); if (cpu < 0) return; /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rq->rd->rto_push_work, cpu); + irq_work_queue_on(&rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ -- cgit From 364f56653708ba8bcdefd4f0da2a42904baa8eeb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 23 Jan 2018 20:45:38 -0500 Subject: sched/rt: Up the root domain ref count when passing it around via IPIs When issuing an IPI RT push, where an IPI is sent to each CPU that has more than one RT task scheduled on it, it references the root domain's rto_mask, that contains all the CPUs within the root domain that has more than one RT task in the runable state. The problem is, after the IPIs are initiated, the rq->lock is released. This means that the root domain that is associated to the run queue could be freed while the IPIs are going around. Add a sched_get_rd() and a sched_put_rd() that will increment and decrement the root domain's ref count respectively. This way when initiating the IPIs, the scheduler will up the root domain's ref count before releasing the rq->lock, ensuring that the root domain does not go away until the IPI round is complete. 
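A condensed sketch of the resulting usage (the full hunks are below): the reference is taken before the rq->lock is released and the first irq_work is queued, and dropped by the last irq_work in the chain when there is no further CPU to push to.

	/* In tell_cpu_to_push(), before queueing the first irq_work: */
	if (cpu >= 0) {
		sched_get_rd(rq->rd);	/* rd must outlive the IPI round */
		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
	}

	/* In rto_push_irq_work_func(), when no further CPU needs a push: */
	if (cpu < 0) {
		sched_put_rd(rd);	/* may trigger the RCU-deferred free */
		return;
	}
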
Reported-by: Pavan Kondeti Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 4bdced5c9a292 ("sched/rt: Simplify the IPI based RT balancing logic") Link: http://lkml.kernel.org/r/CAEU1=PkiHO35Dzna8EQqNSKW1fr1y1zRQ5y66X117MG06sQtNA@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 9 +++++++-- kernel/sched/sched.h | 2 ++ kernel/sched/topology.c | 13 +++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2fb627d062e5..89a086ed2b16 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1990,8 +1990,11 @@ static void tell_cpu_to_push(struct rq *rq) rto_start_unlock(&rq->rd->rto_loop_start); - if (cpu >= 0) + if (cpu >= 0) { + /* Make sure the rd does not get freed while pushing */ + sched_get_rd(rq->rd); irq_work_queue_on(&rq->rd->rto_push_work, cpu); + } } /* Called from hardirq context */ @@ -2021,8 +2024,10 @@ void rto_push_irq_work_func(struct irq_work *work) raw_spin_unlock(&rd->rto_lock); - if (cpu < 0) + if (cpu < 0) { + sched_put_rd(rd); return; + } /* Try the next RT overloaded CPU */ irq_work_queue_on(&rd->rto_push_work, cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e95505e23c6..fb5fc458547f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -691,6 +691,8 @@ extern struct mutex sched_domains_mutex; extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 034cbed7f88b..519b024f4e94 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); } +void sched_get_rd(struct root_domain *rd) +{ + atomic_inc(&rd->refcount); +} + +void sched_put_rd(struct root_domain *rd) +{ + if (!atomic_dec_and_test(&rd->refcount)) + return; + + call_rcu_sched(&rd->rcu, free_rootdomain); +} + static int init_rootdomain(struct root_domain *rd) { if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) -- cgit From e7ad203166fff89b1d8253faf68fbe6966bf7181 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 5 Feb 2018 11:18:41 +0800 Subject: sched/rt: Make update_curr_rt() more accurate rq->clock_task may be updated between the two calls of rq_clock_task() in update_curr_rt(). Calling rq_clock_task() only once makes it more accurate and efficient, taking update_curr() as reference. 
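In other words, sample the clock once and reuse the same timestamp for both the delta and the new exec_start; a condensed sketch of the resulting update_curr_rt():

	u64 now = rq_clock_task(rq);
	u64 delta_exec = now - curr->se.exec_start;

	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->se.exec_start = now;	/* no second rq_clock_task() read */
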
Signed-off-by: Wen Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1517800721-42092-1-git-send-email-wen.yang99@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 89a086ed2b16..663b2355a3aa 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; + u64 now = rq_clock_task(rq); u64 delta_exec; if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -968,7 +969,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); -- cgit From 89a55f56fd1cdbe7e69d4693fc5790af9a6e1501 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jan 2018 10:45:52 +0000 Subject: sched/fair: Remove unnecessary parameters from wake_affine_idle() wake_affine_idle() takes parameters it never uses so clean it up. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180130104555.4125-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a6b8157197bf..0a551dfe54a0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5692,10 +5692,8 @@ static int wake_wide(struct task_struct *p) * scheduling latency of the CPUs. This seems to work * for the overloaded case. */ - static bool -wake_affine_idle(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +wake_affine_idle(int this_cpu, int prev_cpu, int sync) { /* * If this_cpu is idle, it implies the wakeup is from interrupt @@ -5752,8 +5750,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu = smp_processor_id(); bool affine = false; - if (sched_feat(WA_IDLE) && !affine) - affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE)) + affine = wake_affine_idle(this_cpu, prev_cpu, sync); if (sched_feat(WA_WEIGHT) && !affine) affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); -- cgit From 3b76c4a33959ca98a573cd9c94c8690d123912ca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jan 2018 10:45:53 +0000 Subject: sched/fair: Restructure wake_affine*() to return a CPU id This is a preparation patch that has wake_affine*() return a CPU ID instead of a boolean. The intent is to allow the wake_affine() helpers to be avoided if a decision is already made. This patch has no functional change. 
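With this change, "no decision" is encoded as nr_cpumask_bits so the caller can simply fall back to prev_cpu; a condensed sketch of the resulting wake_affine() (schedstat updates omitted):

	int target = nr_cpumask_bits;	/* sentinel: no decision yet */

	if (sched_feat(WA_IDLE))
		target = wake_affine_idle(this_cpu, prev_cpu, sync);

	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

	if (target == nr_cpumask_bits)
		return prev_cpu;

	return target;
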
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180130104555.4125-3-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0a551dfe54a0..4c400d79f1e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5692,7 +5692,7 @@ static int wake_wide(struct task_struct *p) * scheduling latency of the CPUs. This seems to work * for the overloaded case. */ -static bool +static int wake_affine_idle(int this_cpu, int prev_cpu, int sync) { /* @@ -5702,15 +5702,15 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) * node depending on the IO topology or IRQ affinity settings. */ if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) - return true; + return this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) - return true; + return this_cpu; - return false; + return nr_cpumask_bits; } -static bool +static int wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5724,7 +5724,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long current_load = task_h_load(current); if (current_load > this_eff_load) - return true; + return this_cpu; this_eff_load -= current_load; } @@ -5741,28 +5741,28 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load; + return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + int target = nr_cpumask_bits; if (sched_feat(WA_IDLE)) - affine = wake_affine_idle(this_cpu, prev_cpu, sync); + target = wake_affine_idle(this_cpu, prev_cpu, sync); - if (sched_feat(WA_WEIGHT) && !affine) - affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) + target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); - if (affine) { - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - } + if (target == nr_cpumask_bits) + return prev_cpu; - return affine; + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + return target; } static inline unsigned long task_util(struct task_struct *p); @@ -6355,8 +6355,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - if (wake_affine(affine_sd, p, prev_cpu, sync)) - new_cpu = cpu; + new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { -- cgit From 806486c377e33ab662de6d47902e9e2a32b79368 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jan 2018 10:45:54 +0000 Subject: sched/fair: Do not migrate if the prev_cpu is idle wake_affine_idle() prefers to move a task to the current CPU if the wakeup is due to an interrupt. The expectation is that the interrupt data is cache hot and relevant to the waking task as well as avoiding a search. 
However, there is no way to determine if there was cache hot data on the previous CPU that may exceed the interrupt data. Furthermore, round-robin delivery of interrupts can migrate tasks around a socket where each CPU is under-utilised. This can interact badly with cpufreq which makes decisions based on per-cpu data. It has been observed on machines with HWP that p-states are not boosted to their maximum levels even though the workload is latency and throughput sensitive.

This patch uses the previous CPU for the task if it's idle and cache-affine with the current CPU even if the current CPU is idle due to the wakeup being related to the interrupt. This reduces migrations at the cost of the interrupt data not being cache hot when the task wakes.

A variety of workloads were tested on various machines and no adverse impact was noticed that was outside noise. dbench on ext4 on UMA showed roughly 10% reduction in the number of CPU migrations and it is a case where interrupts are frequent for IO completions. In most cases, the difference in performance is quite small but variability is often reduced. For example, this is the result for pgbench running on a UMA machine with different numbers of clients.

                           4.15.0-rc9             4.15.0-rc9
                             baseline              waprev-v1
  Hmean     1      22096.28 (   0.00%)    22734.86 (   2.89%)
  Hmean     4      74633.42 (   0.00%)    75496.77 (   1.16%)
  Hmean     7     115017.50 (   0.00%)   113030.81 (  -1.73%)
  Hmean     12    126209.63 (   0.00%)   126613.40 (   0.32%)
  Hmean     16    131886.91 (   0.00%)   130844.35 (  -0.79%)
  Stddev    1        636.38 (   0.00%)      417.11 (  34.46%)
  Stddev    4        614.64 (   0.00%)      583.24 (   5.11%)
  Stddev    7        542.46 (   0.00%)      435.45 (  19.73%)
  Stddev    12       173.93 (   0.00%)      171.50 (   1.40%)
  Stddev    16       671.42 (   0.00%)      680.30 (  -1.32%)
  CoeffVar  1          2.88 (   0.00%)        1.83 (  36.26%)

Note that the difference in performance is marginal but for low utilisation, there is less variability.

Signed-off-by: Mel Gorman
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Matt Fleming
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20180130104555.4125-4-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4c400d79f1e5..db45b3554682 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5700,9 +5700,15 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * context. Only allow the move if cache is shared. Otherwise an
 	 * interrupt intensive workload could force all tasks onto one
 	 * node depending on the IO topology or IRQ affinity settings.
+	 *
+	 * If the prev_cpu is idle and cache affine then avoid a migration.
+	 * There is no guarantee that the cache hot data from an interrupt
+	 * is more important than cache hot data on the prev_cpu and from
+	 * a cpufreq perspective, it's better to have higher utilisation
+	 * on one CPU.
 	 */
 	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return this_cpu;
+		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
-- cgit

From 32e839dda3ba576943365f0f5817ce5c843137dc Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 30 Jan 2018 10:45:55 +0000
Subject: sched/fair: Use a recently used CPU as an idle candidate and the basis for SIS

The select_idle_sibling() (SIS) rewrite in commit:

  10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings()")

... replaced a domain iteration with a search that broadly speaking does a wrapped walk of the scheduler domain sharing a last-level-cache. While this had a number of improvements, one consequence is that two tasks that share a waker/wakee relationship push each other around a socket. Even though two tasks may be active, all cores are evenly used. This is great from a search perspective and spreads a load across individual cores, but it has adverse consequences for cpufreq. As each CPU has relatively low utilisation, cpufreq may decide the utilisation is too low to use a higher P-state and overall computation throughput suffers.

While individual cpufreq and cpuidle drivers may compensate by artificially boosting P-state (at c0) or avoiding lower C-states (during idle), it does not help if hardware-based cpufreq (e.g. HWP) is used.

This patch tracks a recently used CPU based on what CPU a task was running on when it last was a waker and what CPU it was recently using when a task is a wakee. During SIS, the recently used CPU is used as a target if it's still allowed by the task and is idle.

The benefit may be non-obvious so consider an example of two tasks communicating back and forth. Task A may be an application doing IO where task B is a kworker or kthread like journald. Task A may issue IO, wake B and B wakes up A on completion. With the existing scheme this may look like the following (potentially different IDs if SMT is in use but similar principle applies).

  A (cpu 0)   wake   B (wakes on cpu 1)
  B (cpu 1)   wake   A (wakes on cpu 2)
  A (cpu 2)   wake   B (wakes on cpu 3)
  etc.

A careful reader may wonder why CPU 0 was not idle when B wakes A the first time and it's simply due to the fact that A can be rescheduled to another CPU and the pattern is that prev == target when B tries to wakeup A and the information about CPU 0 has been lost.

With this patch, the pattern is more likely to be:

  A (cpu 0)   wake   B (wakes on cpu 1)
  B (cpu 1)   wake   A (wakes on cpu 0)
  A (cpu 0)   wake   B (wakes on cpu 1)
  etc.

i.e. two communicating tasks are more likely to use just two cores instead of all available cores sharing a LLC.

The most dramatic speedup was noticed on dbench using the XFS filesystem on UMA as clients interact heavily with workqueues in that configuration. Note that a similar speedup is not observed on ext4 as the wakeup pattern is different:

                          4.15.0-rc9             4.15.0-rc9
                           waprev-v1        biasancestor-v1
  Hmean     1       287.54 (   0.00%)      817.01 ( 184.14%)
  Hmean     2      1268.12 (   0.00%)     1781.24 (  40.46%)
  Hmean     4      1739.68 (   0.00%)     1594.47 (  -8.35%)
  Hmean     8      2464.12 (   0.00%)     2479.56 (   0.63%)
  Hmean     64     1455.57 (   0.00%)     1434.68 (  -1.44%)

The results can be less dramatic on NUMA where automatic balancing interferes with the test. It's also known that network benchmarks running on localhost benefit quite a bit from this patch (roughly 10% on netperf RR for UDP and TCP depending on the machine). Hackbench also sees small improvements (6-11% depending on machine and thread count). The facebook schbench was also tested but in most cases showed little or no difference in wakeup latencies.
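A condensed sketch of how the hint is maintained and consumed (the full hunks follow): the waker records the CPU it ran on, and select_idle_sibling() tries that CPU if it is cache-affine, allowed and idle, swapping in prev as the hint for the next wakeup.

	/* In select_task_rq_fair(), on the wake-affine path: */
	current->recent_used_cpu = cpu;

	/* In select_idle_sibling(): */
	recent_used_cpu = p->recent_used_cpu;
	if (recent_used_cpu != prev &&
	    recent_used_cpu != target &&
	    cpus_share_cache(recent_used_cpu, target) &&
	    idle_cpu(recent_used_cpu) &&
	    cpumask_test_cpu(recent_used_cpu, &p->cpus_allowed)) {
		p->recent_used_cpu = prev;	/* candidate for the next wake */
		return recent_used_cpu;
	}
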
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180130104555.4125-5-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b40540e68104..36f113ac6353 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2461,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ + p->recent_used_cpu = task_cpu(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index db45b3554682..5eb3ffc9be84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6197,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - int i; + int i, recent_used_cpu; if (idle_cpu(target)) return target; @@ -6208,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; + /* Check a recently used CPU as a potential idle candidate */ + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + idle_cpu(recent_used_cpu) && + cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake. + */ + p->recent_used_cpu = prev; + return recent_used_cpu; + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -6375,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + if (want_affine) + current->recent_used_cpu = cpu; + } } else { new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } -- cgit
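Taken together, the membarrier changes above give user-space the following call sequence; a minimal, self-contained sketch of how a process might register for and then issue expedited membarriers (assuming a kernel and libc that expose __NR_membarrier and the MEMBARRIER_CMD_* constants from <linux/membarrier.h>):

  #include <linux/membarrier.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <stdio.h>

  static int membarrier(int cmd, int flags)
  {
          return syscall(__NR_membarrier, cmd, flags);
  }

  int main(void)
  {
          /* One-time registration, typically at process start. */
          if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
                  perror("register private expedited");
          if (membarrier(MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, 0))
                  perror("register global expedited");

          /* Fast path: order memory against all threads of this process ... */
          if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
                  perror("private expedited");

          /* ... or against every process registered for GLOBAL_EXPEDITED. */
          if (membarrier(MEMBARRIER_CMD_GLOBAL_EXPEDITED, 0))
                  perror("global expedited");

          return 0;
  }

On architectures that select ARCH_HAS_MEMBARRIER_SYNC_CORE, the *_SYNC_CORE register/issue commands are used the same way, e.g. by a JIT reclaiming code memory.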