From 3e777f9909483b603946685d88acfae89f31b07b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 28 Feb 2017 15:50:30 -0500 Subject: sched/rt: Add comments describing the RT IPI pull method While looking into optimizations for the RT scheduler IPI logic, I realized that the comments are lacking to describe it efficiently. It deserves a lengthy description describing its design. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170228155030.30c69068@gandalf.local.home [ Small typographical edits. ] Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9f3e40226dec..979b7341008a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq) #define RT_PUSH_IPI_EXECUTING 1 #define RT_PUSH_IPI_RESTART 2 +/* + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * On large CPU boxes, there's the case that several CPUs could schedule + * a lower priority task at the same time, in which case it will look for + * any overloaded CPUs that it could pull a task from. To do this, the runqueue + * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting + * for a single overloaded CPU's runqueue lock can produce a large latency. + * (This has actually been observed on large boxes running cyclictest). + * Instead of taking the runqueue lock of the overloaded CPU, each of the + * CPUs that scheduled a lower priority task simply sends an IPI to the + * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with + * lots of contention. The overloaded CPU will look to push its non-running + * RT task off, and if it does, it can then ignore the other IPIs coming + * in, and just pass those IPIs off to any other overloaded CPU. + * + * When a CPU schedules a lower priority task, it only sends an IPI to + * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, + * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with + * RT overloaded tasks, would cause 100 IPIs to go out at once. + * + * The overloaded RT CPU, when receiving an IPI, will try to push off its + * overloaded RT tasks and then send an IPI to the next CPU that has + * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks + * have completed. Just because a CPU may have pushed off its own overloaded + * RT task does not mean it should stop sending the IPI around to other + * overloaded CPUs. There may be another RT task waiting to run on one of + * those CPUs that are of higher priority than the one that was just + * pushed. 
+ * + * An optimization that could possibly be made is to make a CPU array similar + * to the cpupri array mask of all running RT tasks, but for the overloaded + * case, then the IPI could be sent to only the CPU with the highest priority + * RT task waiting, and that CPU could send off further IPIs to the CPU with + * the next highest waiting task. Since the overloaded case is much less likely + * to happen, the complexity of this implementation may not be worth it. + * Instead, just send an IPI around to all overloaded CPUs. + * + * The rq->rt.push_flags holds the status of the IPI that is going around. + * A run queue can only send out a single IPI at a time. The possible flags + * for rq->rt.push_flags are: + * + * (None or zero): No IPI is going around for the current rq + * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around + * RT_PUSH_IPI_RESTART: The priority of the running task for the rq + * has changed, and the IPI should restart + * circulating the overloaded CPUs again. + * + * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated + * before sending to the next CPU. + * + * Instead of having all CPUs that schedule a lower priority task send + * an IPI to the same "first" CPU in the RT overload mask, they send it + * to the next overloaded CPU after their own CPU. This helps distribute + * the work when there's more than one overloaded CPU and multiple CPUs + * scheduling in lower priority tasks. + * + * When a rq schedules a lower priority task than what was currently + * running, the next CPU with overloaded RT tasks is examined first. + * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower + * priority task, it will send an IPI first to CPU 5, then CPU 5 will + * send to CPU 1 if it is still overloaded. CPU 1 will clear the + * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. + * + * The first CPU to notice IPI_RESTART is set, will clear that flag and then + * send an IPI to the next overloaded CPU after the rq->cpu and not the next + * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 + * schedules a lower priority task, and the IPI_RESTART gets set while the + * handling is being done on CPU 5, it will clear the flag and send it back to + * CPU 4 instead of CPU 1. + * + * Note, the above logic can be disabled by turning off the sched_feature + * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be + * taken by the CPU requesting a pull and the waiting RT task will be pulled + * by that CPU. This may be fine for machines with few CPUs. + */ static void tell_cpu_to_push(struct rq *rq) { int cpu; -- cgit From 26ae58d23b94a075ae724fd18783a3773131cfbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:53:49 +0200 Subject: sched/core: Add WARNING for multiple update_rq_clock() calls Now that we have no missing calls, add a warning to find multiple calls. By having only a single update_rq_clock() call per rq-lock section, the section appears 'atomic' wrt time. 
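As an editor's illustration of what this check buys, here is a standalone toy model (plain userspace C; the struct and helpers below are invented stand-ins, not the kernel's code) of the invariant WARN_DOUBLE_CLOCK enforces: at most one clock update per rq->lock section, so everything done under the lock observes a single timestamp.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct rq { bool clock_updated; unsigned long long clock; };

    static void rq_lock(struct rq *rq)   { rq->clock_updated = false; /* + take the real lock */ }
    static void rq_unlock(struct rq *rq) { (void)rq;                  /* + drop the real lock */ }

    static void update_rq_clock(struct rq *rq)
    {
            /* Stand-in for SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED): */
            assert(!rq->clock_updated && "double update_rq_clock() in one rq->lock section");
            rq->clock_updated = true;
            rq->clock++;    /* stand-in for reading sched_clock_cpu() */
    }

    int main(void)
    {
            struct rq rq = { 0 };

            rq_lock(&rq);
            update_rq_clock(&rq);
            /* update_rq_clock(&rq);   <-- a second call here would trip the assert */
            rq_unlock(&rq);

            printf("clock = %llu\n", rq.clock);
            return 0;
    }
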
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ kernel/sched/features.h | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1..1bd15d0d0307 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -233,8 +233,11 @@ void update_rq_clock(struct rq *rq) return; #ifdef CONFIG_SCHED_DEBUG + if (sched_feat(WARN_DOUBLE_CLOCK)) + SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; #endif + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1b3c8189b286..11192e0cb122 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_AVG_CPU, false) +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. + */ +SCHED_FEAT(WARN_DOUBLE_CLOCK, false) + #ifdef HAVE_RT_PUSH_IPI /* * In order to avoid a thundering herd attack of CPUs that are -- cgit From 8a8c69c32778865affcedc2111bb5d938b50516f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2016 16:04:35 +0200 Subject: sched/core: Add rq->lock wrappers The missing update_rq_clock() check can work with partial rq->lock wrappery, since a missing wrapper can cause the warning to not be emitted when it should have, but cannot cause the warning to trigger when it should not have. The duplicate update_rq_clock() check however can cause false warnings to trigger. Therefore add more comprehensive rq->lock wrappery. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 155 ++++++++++++++++++++++++--------------------------- kernel/sched/fair.c | 71 ++++++++++++----------- kernel/sched/sched.h | 57 +++++++++++++++++++ 3 files changed, 171 insertions(+), 112 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1bd15d0d0307..c5a514b1668d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -85,21 +85,6 @@ int sysctl_sched_rt_runtime = 950000; /* CPUs with isolated domains */ cpumask_var_t cpu_isolated_map; -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - /* * __task_rq_lock - lock the rq @p resides on. 
*/ @@ -264,13 +249,14 @@ static void hrtick_clear(struct rq *rq) static enum hrtimer_restart hrtick(struct hrtimer *timer) { struct rq *rq = container_of(timer, struct rq, hrtick_timer); + struct rq_flags rf; WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); return HRTIMER_NORESTART; } @@ -290,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) static void __hrtick_start(void *arg) { struct rq *rq = arg; + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); __hrtick_restart(rq); rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -949,18 +936,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * * Returns (locked) new rq. Old rq's lock is released. */ -static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int new_cpu) { lockdep_assert_held(&rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); - raw_spin_lock(&rq->lock); + rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; @@ -983,7 +971,8 @@ struct migration_arg { * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ -static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu) { if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -992,7 +981,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; - rq = move_queued_task(rq, p, dest_cpu); + rq = move_queued_task(rq, rf, p, dest_cpu); return rq; } @@ -1007,6 +996,7 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + struct rq_flags rf; /* * The original target CPU might have gone down and we might @@ -1021,7 +1011,7 @@ static int migration_cpu_stop(void *data) sched_ttwu_pending(); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1029,11 +1019,11 @@ static int migration_cpu_stop(void *data) */ if (task_rq(p) == rq) { if (task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; } - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); raw_spin_unlock(&p->pi_lock); local_irq_enable(); @@ -1153,9 +1143,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. 
*/ - rq_unpin_lock(rq, &rf); - rq = move_queued_task(rq, p, dest_cpu); - rq_repin_lock(rq, &rf); + rq = move_queued_task(rq, &rf, p, dest_cpu); } out: task_rq_unlock(rq, p, &rf); @@ -1220,16 +1208,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; + struct rq_flags srf, drf; src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + rq_pin_lock(src_rq, &srf); + rq_pin_lock(dst_rq, &drf); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, &srf); + } else { /* * Task isn't running anymore; make it appear like we migrated @@ -1729,14 +1725,12 @@ void sched_ttwu_pending(void) struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; - unsigned long flags; struct rq_flags rf; if (!llist) return; - raw_spin_lock_irqsave(&rq->lock, flags); - rq_pin_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); while (llist) { int wake_flags = 0; @@ -1750,8 +1744,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, wake_flags, &rf); } - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } void scheduler_ipi(void) @@ -1809,7 +1802,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; rcu_read_lock(); @@ -1819,11 +1812,11 @@ void wake_up_if_idle(int cpu) if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); } else { - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); /* Else CPU is not idle, do nothing here: */ - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } out: @@ -1849,11 +1842,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } #endif - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); ttwu_do_activate(rq, p, wake_flags, &rf); - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -2100,11 +2091,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. 
*/ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); + rq_relock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2778,9 +2767,9 @@ static void __balance_callback(struct rq *rq) { struct callback_head *head, *next; void (*func)(struct rq *rq); - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); head = rq->balance_callback; rq->balance_callback = NULL; while (head) { @@ -2791,7 +2780,7 @@ static void __balance_callback(struct rq *rq) func(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static inline void balance_callback(struct rq *rq) @@ -3096,15 +3085,18 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + struct rq_flags rf; sched_clock_tick(); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); + update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); - raw_spin_unlock(&rq->lock); + + rq_unlock(rq, &rf); perf_event_task_tick(); @@ -3389,8 +3381,7 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; @@ -3442,8 +3433,7 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } balance_callback(rq); @@ -4926,7 +4916,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, */ SYSCALL_DEFINE0(sched_yield) { - struct rq *rq = this_rq_lock(); + struct rq_flags rf; + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, &rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -4935,9 +4930,8 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); + preempt_disable(); + rq_unlock(rq, &rf); sched_preempt_enable_no_resched(); schedule(); @@ -5582,11 +5576,11 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf; + struct rq_flags orf = *rf; int dest_cpu; /* @@ -5605,9 +5599,7 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ - rq_pin_lock(rq, &rf); update_rq_clock(rq); - rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5620,8 +5612,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_repin_lock(rq, &rf); - next = pick_next_task(rq, &fake_task, &rf); + next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5634,10 +5625,9 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. 
Also, stop-machine. */ - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); - raw_spin_lock(&rq->lock); + rq_relock(rq, rf); /* * Since we're inside stop-machine, _nothing_ should have @@ -5651,12 +5641,12 @@ static void migrate_tasks(struct rq *dead_rq) /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - - rq = __migrate_task(rq, next, dest_cpu); + rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = dead_rq; - raw_spin_lock(&rq->lock); + *rf = orf; + rq_relock(rq, rf); } raw_spin_unlock(&next->pi_lock); } @@ -5769,7 +5759,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; set_cpu_active(cpu, true); @@ -5787,12 +5777,12 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); update_max_interval(); @@ -5850,18 +5840,20 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); - raw_spin_lock_irqsave(&rq->lock, flags); + + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(cpu); @@ -7011,14 +7003,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; + struct rq_flags rf; - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b91..72b081b9a249 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4271,8 +4271,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4289,7 +4290,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -5097,15 +5098,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, 
load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -6913,9 +6915,11 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6926,8 +6930,9 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6936,7 +6941,7 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6944,9 +6949,9 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* @@ -6965,7 +6970,7 @@ static void update_blocked_averages(int cpu) if (cfs_rq->tg->se[cpu]) update_load_avg(cfs_rq->tg->se[cpu], 0); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -7019,12 +7024,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -8042,7 +8047,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { @@ -8105,7 +8110,7 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); update_rq_clock(busiest); /* @@ -8122,14 +8127,14 @@ more_balance: * See task_rq_lock() family for the details. 
*/ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8207,6 +8212,8 @@ more_balance: sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -8444,8 +8451,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8496,7 +8504,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8794,10 +8802,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8988,8 +8999,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -9010,7 +9022,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9372,7 +9384,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. 
@@ -9389,19 +9400,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(se, UPDATE_TG); update_cfs_shares(se); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } done: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5cbf92214ad8..7d4f69329634 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1624,6 +1624,7 @@ static inline void sched_avg_update(struct rq *rq) { } struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); + struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); @@ -1645,6 +1646,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT -- cgit From 0a67d1ee30ef1efe6a412b3590e08734902aed43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2016 16:29:45 +0200 Subject: sched/core: Add {EN,DE}QUEUE_NOCLOCK flags Currently {en,de}queue_task() do an unconditional update_rq_clock(). However since we want to avoid duplicate updates, so that each rq->lock section appears atomic in time, we need to be able to skip these clock updates. 
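To illustrate the mechanism this patch adds, here is a minimal standalone sketch (userspace C; the struct and helpers are invented stand-ins — only the NOCLOCK idea and the 0x08 flag values come from the patch below): the caller updates the clock once, and the enqueue/dequeue helpers skip their own update, so the whole locked section sees one clock value.

    #include <stdio.h>

    #define ENQUEUE_NOCLOCK 0x08
    #define DEQUEUE_NOCLOCK 0x08

    struct rq { unsigned long long clock; unsigned long long nr_updates; };

    static void update_rq_clock(struct rq *rq)
    {
            rq->nr_updates++;               /* stand-in for reading sched_clock_cpu() */
            rq->clock = rq->nr_updates;
    }

    static void dequeue_task(struct rq *rq, int flags)
    {
            if (!(flags & DEQUEUE_NOCLOCK))
                    update_rq_clock(rq);
            /* ... p->sched_class->dequeue_task() ... */
    }

    static void enqueue_task(struct rq *rq, int flags)
    {
            if (!(flags & ENQUEUE_NOCLOCK))
                    update_rq_clock(rq);
            /* ... p->sched_class->enqueue_task() ... */
    }

    int main(void)
    {
            struct rq rq = { 0 };

            /* One explicit update; both operations then skip theirs: */
            update_rq_clock(&rq);
            dequeue_task(&rq, DEQUEUE_NOCLOCK);
            enqueue_task(&rq, ENQUEUE_NOCLOCK);

            printf("clock was updated %llu time(s)\n", rq.nr_updates);
            return 0;
    }
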
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++++++-- kernel/sched/sched.h | 8 +++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c5a514b1668d..ce363bdc7e6b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -752,17 +752,23 @@ static void set_load_weight(struct task_struct *p) static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); + p->sched_class->enqueue_task(rq, p, flags); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & DEQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); + p->sched_class->dequeue_task(rq, p, flags); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7d4f69329634..de4b934ba974 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40]; #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 #define ENQUEUE_MOVE 0x04 +#define ENQUEUE_NOCLOCK 0x08 -#define ENQUEUE_HEAD 0x08 -#define ENQUEUE_REPLENISH 0x10 +#define ENQUEUE_HEAD 0x10 +#define ENQUEUE_REPLENISH 0x20 #ifdef CONFIG_SMP -#define ENQUEUE_MIGRATED 0x20 +#define ENQUEUE_MIGRATED 0x40 #else #define ENQUEUE_MIGRATED 0x00 #endif -- cgit From 7134b3e941613dcb959b4b178cc4a35e45cbbc0d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Feb 2017 14:23:38 +0100 Subject: sched/core: Add ENQUEUE_NOCLOCK to ENQUEUE_RESTORE In all cases, ENQUEUE_RESTORE should also have ENQUEUE_NOCLOCK because DEQUEUE_SAVE will have done an update_rq_clock(). 
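As a concrete picture of the pairing described above, the following fragment is a simplified composite of the hunks below (kernel-style pseudo-code, not a verbatim copy; locking and error handling elided): the DEQUEUE_SAVE side has already refreshed the clock, so the matching restore can carry ENQUEUE_NOCLOCK.

    /* Simplified change-attribute path, modeled on set_user_nice()/sched_move_task(): */
    queued = task_on_rq_queued(p);
    if (queued)
            dequeue_task(rq, p, DEQUEUE_SAVE);      /* this updates the rq clock */

    /* ... change the task's priority / cpumask / group ... */

    if (queued)                                     /* clock is already fresh: */
            enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
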
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ce363bdc7e6b..247d0a0c319e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1070,7 +1070,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); } @@ -3815,7 +3815,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -5517,7 +5517,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); @@ -6431,7 +6431,7 @@ void sched_move_task(struct task_struct *tsk) sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, tsk); -- cgit From 77558e4d01ac0c7fa8cb1af4a61c2ab508d79f30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Feb 2017 14:36:23 +0100 Subject: sched/core: Make sched_ttwu_pending() atomic in time Since all tasks on the wake_list are woken under a single rq->lock avoid calling update_rq_clock() for each task. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 247d0a0c319e..dead90d680fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1685,7 +1685,7 @@ static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - int en_flags = ENQUEUE_WAKEUP; + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; lockdep_assert_held(&rq->lock); @@ -1737,6 +1737,7 @@ void sched_ttwu_pending(void) return; rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); while (llist) { int wake_flags = 0; @@ -1849,6 +1850,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #endif rq_lock(rq, &rf); + update_rq_clock(rq); ttwu_do_activate(rq, p, wake_flags, &rf); rq_unlock(rq, &rf); } -- cgit From bce4dc80c66ad355c74e876c82ce371020754627 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Feb 2017 14:40:35 +0100 Subject: sched/core: Simplify update_rq_clock() in __schedule() Instead of relying on deactivate_task() to call update_rq_clock() and handling the case where it didn't happen (task_on_rq_queued), unconditionally do update_rq_clock() and skip any further updates. This also avoids a double update on deactivate_task() + ttwu_local(). 
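For clarity, the shape of the change (paraphrased and heavily simplified from the __schedule() hunk below; not the full function) is:

    /* Before: the clock update depended on which path was taken. */
    if (!preempt && prev->state)
            deactivate_task(rq, prev, DEQUEUE_SLEEP);       /* updated the clock */
    /* ... */
    if (task_on_rq_queued(prev))
            update_rq_clock(rq);                            /* catch-up for the other path */

    /* After: update once, up front, and tell later calls to skip it. */
    update_rq_clock(rq);
    if (!preempt && prev->state)
            deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
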
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dead90d680fd..179a6c928bf1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2114,7 +2114,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) delayacct_blkio_end(); atomic_dec(&rq->nr_iowait); } - ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); } ttwu_do_wakeup(rq, p, 0, rf); @@ -3393,13 +3393,14 @@ static void __sched notrace __schedule(bool preempt) /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; + update_rq_clock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); prev->on_rq = 0; if (prev->in_iowait) { @@ -3423,9 +3424,6 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev)) - update_rq_clock(rq); - next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); -- cgit From 7a57f32a4d5c80c7790929dd7f4441bb6bff7480 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Feb 2017 14:47:02 +0100 Subject: sched/core: Avoid obvious double update_rq_clock() Add DEQUEUE_NOCLOCK to all places where we just did an update_rq_clock() already. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 179a6c928bf1..c6be770d6e68 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1062,7 +1062,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. 
*/ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); @@ -2555,7 +2555,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(&p->se); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3683,7 +3683,8 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; + int oldprio, queued, running, queue_flag = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; @@ -3804,7 +3805,7 @@ void set_user_nice(struct task_struct *p, long nice) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) put_prev_task(rq, p); @@ -4125,7 +4126,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq_flags rf; int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; /* May grab non-irq protected spin_locks: */ @@ -6413,7 +6414,8 @@ static void sched_change_group(struct task_struct *tsk, int type) */ void sched_move_task(struct task_struct *tsk) { - int queued, running; + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; @@ -6424,14 +6426,14 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + dequeue_task(rq, tsk, queue_flags); if (running) put_prev_task(rq, tsk); sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE | ENQUEUE_NOCLOCK); + enqueue_task(rq, tsk, queue_flags); if (running) set_curr_task(rq, tsk); -- cgit From 5704ac0ae7f59581a264f45ddfc0ab4235aa052a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Feb 2017 17:15:21 +0100 Subject: sched/core: Fix double update_rq_clock) calls in attach_task()/detach_task() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 72b081b9a249..2805bd7c8994 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6771,7 +6771,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6904,7 +6904,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6918,6 +6918,7 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) struct rq_flags rf; rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); rq_unlock(rq, &rf); } @@ -6933,6 +6934,7 @@ static void attach_tasks(struct lb_env 
*env) struct rq_flags rf; rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); -- cgit From 15ff991e8047561bb4a4e800ec60f60939be5fd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Oct 2016 17:59:32 +0200 Subject: sched/core: Avoid double update_rq_clock() in move_queued_task() Address this case: WARNING: CPU: 0 PID: 2070 at ../kernel/sched/core.c:109 update_rq_clock+0x74/0x80 rq->clock_update_flags & RQCF_UPDATED Call Trace: update_rq_clock() move_queued_task() __set_cpus_allowed_ptr() ... Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c6be770d6e68..c762f627b9f2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -948,7 +948,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, lockdep_assert_held(&rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -987,6 +987,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; + update_rq_clock(rq); rq = move_queued_task(rq, rf, p, dest_cpu); return rq; -- cgit From d7921a5ddab8d30e06e321f37eec629f23797486 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 16 Mar 2017 19:45:19 -0700 Subject: sched/core: Fix rq lock pinning warning after call balance callbacks This can be reproduced by running rt-migrate-test: WARNING: CPU: 2 PID: 2195 at kernel/locking/lockdep.c:3670 lock_unpin_lock() unpinning an unpinned lock ... Call Trace: dump_stack() __warn() warn_slowpath_fmt() lock_unpin_lock() __balance_callback() __schedule() schedule() futex_wait_queue_me() futex_wait() do_futex() SyS_futex() do_syscall_64() entry_SYSCALL64_slow_path() Revert the rq_lock_irqsave() usage here, the whole point of the balance_callback() was to allow dropping rq->lock. 
Reported-by: Fengguang Wu Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 8a8c69c32778 ("sched/core: Add rq->lock wrappers") Link: http://lkml.kernel.org/r/1489718719-3951-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c762f627b9f2..ab9f6ac099a7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2776,9 +2776,9 @@ static void __balance_callback(struct rq *rq) { struct callback_head *head, *next; void (*func)(struct rq *rq); - struct rq_flags rf; + unsigned long flags; - rq_lock_irqsave(rq, &rf); + raw_spin_lock_irqsave(&rq->lock, flags); head = rq->balance_callback; rq->balance_callback = NULL; while (head) { @@ -2789,7 +2789,7 @@ static void __balance_callback(struct rq *rq) func(rq); } - rq_unlock_irqrestore(rq, &rf); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static inline void balance_callback(struct rq *rq) -- cgit From bc4278987e3874da62edf585fe8b3bdd9b53f638 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Mar 2017 14:47:22 +0100 Subject: sched/fair: Fix FTQ noise bench regression A regression of the FTQ noise has been reported by Ying Huang, on the following hardware: 8 threads Intel(R) Core(TM)i7-4770 CPU @ 3.40GHz with 8G memory ... which was caused by this commit: commit 4e5160766fcc ("sched/fair: Propagate asynchrous detach") The only part of the patch that can increase the noise is the update of blocked load of group entity in update_blocked_averages(). We can optimize this call and skip the update of group entity if its load and utilization are already null and there is no pending propagation of load in the task group. This optimization partly restores the noise score. A more agressive optimization has been tried but has shown worse score. Reported-by: ying.huang@linux.intel.com Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: ying.huang@intel.com Fixes: 4e5160766fcc ("sched/fair: Propagate asynchrous detach") Link: http://lkml.kernel.org/r/1489758442-2877-1-git-send-email-vincent.guittot@linaro.org [ Fixed typos, improved layout. 
] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2805bd7c8994..03adf9fb48b1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3173,6 +3173,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -6961,6 +6991,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6968,9 +7000,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } rq_unlock_irqrestore(rq, &rf); } -- cgit From 05b40e057734811ce452344fb3690d09965a7b6a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 22 Mar 2017 23:27:50 +0530 Subject: sched/fair: Prefer sibiling only if local group is under-utilized If the child domain prefers tasks to go siblings, the local group could end up pulling tasks to itself even if the local group is almost equally loaded as the source group. Lets assume a 4 core,smt==2 machine running 5 thread ebizzy workload. Everytime, local group has capacity and source group has atleast 2 threads, local group tries to pull the task. This causes the threads to constantly move between different cores. This is even more profound if the cores have more threads, like in Power 8, smt 8 mode. Fix this by only allowing local group to pull a task, if the source group has more number of tasks than the local group. Here are the relevant perf stat numbers of a 22 core,smt 8 Power 8 machine. 
Without patch: Performance counter stats for 'ebizzy -t 22 -S 100' (5 runs): 1,440 context-switches # 0.001 K/sec ( +- 1.26% ) 366 cpu-migrations # 0.000 K/sec ( +- 5.58% ) 3,933 page-faults # 0.002 K/sec ( +- 11.08% ) Performance counter stats for 'ebizzy -t 48 -S 100' (5 runs): 6,287 context-switches # 0.001 K/sec ( +- 3.65% ) 3,776 cpu-migrations # 0.001 K/sec ( +- 4.84% ) 5,702 page-faults # 0.001 K/sec ( +- 9.36% ) Performance counter stats for 'ebizzy -t 96 -S 100' (5 runs): 8,776 context-switches # 0.001 K/sec ( +- 0.73% ) 2,790 cpu-migrations # 0.000 K/sec ( +- 0.98% ) 10,540 page-faults # 0.001 K/sec ( +- 3.12% ) With patch: Performance counter stats for 'ebizzy -t 22 -S 100' (5 runs): 1,133 context-switches # 0.001 K/sec ( +- 4.72% ) 123 cpu-migrations # 0.000 K/sec ( +- 3.42% ) 3,858 page-faults # 0.002 K/sec ( +- 8.52% ) Performance counter stats for 'ebizzy -t 48 -S 100' (5 runs): 2,169 context-switches # 0.000 K/sec ( +- 6.19% ) 189 cpu-migrations # 0.000 K/sec ( +- 12.75% ) 5,917 page-faults # 0.001 K/sec ( +- 8.09% ) Performance counter stats for 'ebizzy -t 96 -S 100' (5 runs): 5,333 context-switches # 0.001 K/sec ( +- 5.91% ) 506 cpu-migrations # 0.000 K/sec ( +- 3.35% ) 10,792 page-faults # 0.001 K/sec ( +- 7.75% ) Which show that in these workloads CPU migrations get reduced significantly. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/1490205470-10249-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03adf9fb48b1..31453d57e8f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7565,6 +7565,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7581,7 +7582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7605,8 +7606,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } -- cgit From 0ccb977f4c80b921a8bf6a2c4b8ea0c1fed6553c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Mar 2017 11:08:20 +0200 Subject: sched/fair: Explicitly generate __update_load_avg() instances The __update_load_avg() function is an __always_inline because its used with constant propagation to generate different variants of the code without having to duplicate it (which would be prone to bugs). Explicitly instantiate the 3 variants. Note that most of this is called from rather hot paths, so reducing branches is good. 
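The underlying pattern, for readers unfamiliar with it, is to keep a single __always_inline worker whose constant arguments let the compiler fold away dead branches in each named wrapper. A standalone toy version (the functions below are invented stand-ins, not the PELT code) looks like this:

    #include <stdio.h>

    #define __always_inline inline __attribute__((__always_inline__))

    static __always_inline long
    ___accumulate(long sum, long weight, int running)
    {
            /* With constant 'weight' and 'running', each caller below
             * compiles into its own specialized variant. */
            if (weight)
                    sum += weight;
            if (running)
                    sum += 1;
            return sum;
    }

    /* Three explicit instances, mirroring the blocked_se / se / cfs_rq split: */
    static long acc_blocked(long sum)         { return ___accumulate(sum, 0, 0); }
    static long acc_entity(long sum, long w)  { return ___accumulate(sum, w, 1); }
    static long acc_queue(long sum, long w)   { return ___accumulate(sum, w, 0); }

    int main(void)
    {
            printf("%ld %ld %ld\n", acc_blocked(10), acc_entity(10, 5), acc_queue(10, 5));
            return 0;
    }
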
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31453d57e8f5..2ac00cfbf29f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2849,7 +2849,7 @@ static u32 __compute_runnable_contrib(u64 n) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { u64 delta, scaled_delta, periods; @@ -2953,6 +2953,28 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} + +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} + +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); +} + /* * Signed add and clamp on underflow. * @@ -3014,6 +3036,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3024,11 +3049,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. 
*/ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3043,14 +3068,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3295,8 +3319,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3328,11 +3351,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3437,7 +3457,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* -- cgit From a481db34b9beb7a9647c23f2320dd38a2b1d681f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 13 Feb 2017 05:44:23 +0800 Subject: sched/fair: Optimize ___update_sched_avg() The main PELT function ___update_load_avg(), which implements the accumulation and progression of the geometric average series, is implemented along the following lines for the scenario where the time delta spans all 3 possible sections (see figure below): 1. add the remainder of the last incomplete period 2. decay old sum 3. accumulate new sum in full periods since last_update_time 4. accumulate the current incomplete period 5. update averages Or: d1 d2 d3 ^ ^ ^ | | | |<->|<----------------->|<--->| ... |---x---|------| ... |------|-----x (now) load_sum' = (load_sum + weight * scale * d1) * y^(p+1) + (1,2) p weight * scale * 1024 * \Sum y^n + (3) n=1 weight * scale * d3 * y^0 (4) load_avg' = load_sum' / LOAD_AVG_MAX (5) Where: d1 - is the delta part completing the remainder of the last incomplete period, d2 - is the delta part spannind complete periods, and d3 - is the delta part starting the current incomplete period. 
We can simplify the code in two steps; the first step is to separate the first term into new and old parts like: (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) + weight * scale * d1 * y^(p+1) Once we've done that, its easy to see that all new terms carry the common factors: weight * scale If we factor those out, we arrive at the form: load_sum' = load_sum * y^(p+1) + weight * scale * (d1 * y^(p+1) + p 1024 * \Sum y^n + n=1 d3 * y^0) Which results in a simpler, smaller and faster implementation. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: matt@codeblueprint.co.uk Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 212 +++++++++++++++++++++++++++++----------------------- 1 file changed, 118 insertions(+), 94 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ac00cfbf29f..76f67b3e34d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = { * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; @@ -2795,31 +2795,112 @@ static __always_inline u64 decay_load(u64 val, u64 n) return val; } -/* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n - * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n = LOAD_AVG_MAX_N)) + if (!periods) + return remainder - period_contrib; + + if (unlikely(periods >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; + /* + * c1 = d1 y^(p+1) + */ + c1 = decay_load((u64)(1024 - period_contrib), periods); + + periods -= 1; + /* + * For updates fully spanning n periods, the contribution to runnable + * average will be: + * + * c2 = 1024 \Sum y^n + * + * We can compute this reasonably efficiently by combining: + * + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD} + */ + if (likely(periods <= LOAD_AVG_PERIOD)) { + c2 = runnable_avg_yN_sum[periods]; + } else { + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD]; + periods %= LOAD_AVG_PERIOD; + c2 = decay_load(c2, periods); + c2 += runnable_avg_yN_sum[periods]; + } + + return c1 + c2 + c3; } #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) +/* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... 
|------|-----x (now) + * + * p + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^(p+1) + (Step 1) + * + * p + * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) +{ + unsigned long scale_freq, scale_cpu; + u64 periods; + u32 contrib; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + } + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_sum(periods, sa->period_contrib, delta); + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2852,10 +2933,7 @@ static __always_inline int ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; - - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. 
- */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; - - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; - } - - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; - - sa->period_contrib += delta; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - return decayed; + return 1; } static int -- cgit From 717a94b5fc7092afebe9c93791f29b2d8e5d297a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 10:03:26 +1000 Subject: sched/core: Remove 'task' parameter and rename tsk_restore_flags() to current_restore_flags() It is not safe for one thread to modify the ->flags of another thread as there is no locking that can protect the update. So tsk_restore_flags(), which takes a task pointer and modifies the flags, is an invitation to do the wrong thing. All current users pass "current" as the task, so no developers have accepted that invitation. It would be best to ensure it remains that way. So rename tsk_restore_flags() to current_restore_flags() and don't pass in a task_struct pointer. Always operate on current->flags. 
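To make the resulting calling convention concrete, a minimal usage sketch modelled on the callers converted below (do_memalloc_work() is a made-up placeholder, not a function from this patch):

	unsigned long pflags = current->flags;

	current->flags |= PF_MEMALLOC;			/* enter the PF_MEMALLOC section */
	do_memalloc_work();				/* placeholder for the real work */
	current_restore_flags(pflags, PF_MEMALLOC);	/* restore only the PF_MEMALLOC bit */

No task_struct pointer changes hands, so the helper can only ever touch current->flags.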
Signed-off-by: NeilBrown Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/block/nbd.c | 2 +- drivers/scsi/iscsi_tcp.c | 2 +- fs/nfsd/vfs.c | 2 +- include/linux/sched.h | 6 +++--- kernel/softirq.c | 2 +- net/core/dev.c | 2 +- net/core/sock.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d8a23561b4cb..3c9052bf2327 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -244,7 +244,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, *sent += result; } while (msg_data_left(&msg)); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return result; } diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 4228aba1f654..bbea8eac9abb 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task) rc = 0; } - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return rc; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 19d50f600e8d..9aaf6ca77569 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1004,7 +1004,7 @@ out_nfserr: else err = nfserrno(host_err); if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LESS_THROTTLE); return err; } diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..0978fb74e45a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1286,10 +1286,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting) TASK_PFA_SET(LMK_WAITING, lmk_waiting) static inline void -tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) +current_restore_flags(unsigned long orig_flags, unsigned long flags) { - task->flags &= ~flags; - task->flags |= orig_flags & flags; + current->flags &= ~flags; + current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); diff --git a/kernel/softirq.c b/kernel/softirq.c index 744fa611cae0..4e09821f9d9e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -309,7 +309,7 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); - tsk_restore_flags(current, old_flags, PF_MEMALLOC); + current_restore_flags(old_flags, PF_MEMALLOC); } asmlinkage __visible void do_softirq(void) diff --git a/net/core/dev.c b/net/core/dev.c index 7869ae3837ca..e8a366387a99 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4240,7 +4240,7 @@ static int __netif_receive_skb(struct sk_buff *skb) */ current->flags |= PF_MEMALLOC; ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); } else ret = __netif_receive_skb_core(skb, false); diff --git a/net/core/sock.c b/net/core/sock.c index 2c4f574168fb..b416a537bd0a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) current->flags |= PF_MEMALLOC; ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return ret; } -- cgit From 05296e7535d67ba4926b543a09cf5d430a815cb6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Mar 2017 10:51:41 +0200 
Subject: sched/fair: Fix corner case in __accumulate_sum() Paul noticed that in the (periods >= LOAD_AVG_MAX_N) case in __accumulate_sum(), the returned contribution value (LOAD_AVG_MAX) is incorrect. This is because at this point, the decay_load() on the old state -- the first step in accumulate_sum() -- will not have resulted in 0, and will therefore result in a sum larger than the maximum value of our series. Obviously broken. Note that: decay_load(LOAD_AVG_MAX, LOAD_AVG_MAX_N) = 1 (345 / 32) 47742 * - ^ = ~27 2 Not to mention that any further contribution from the d3 segment (our new period) would also push it over the maximum. Solve this by noting that we can write our c2 term: p c2 = 1024 \Sum y^n n=1 In terms of our maximum value: inf inf p max = 1024 \Sum y^n = 1024 ( \Sum y^n + \Sum y^n + y^0 ) n=0 n=p+1 n=1 Further note that: inf inf inf ( \Sum y^n ) y^p = \Sum y^(n+p) = \Sum y^n n=0 n=0 n=p Combined that gives us: p c2 = 1024 \Sum y^n n=1 inf inf = 1024 ( \Sum y^n - \Sum y^n - y^0 ) n=0 n=p+1 = max - (max y^(p+1)) - 1024 Further simplify things by dealing with p=0 early on. Reported-by: Paul Turner Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: linux-kernel@vger.kernel.org Fixes: a481db34b9be ("sched/fair: Optimize ___update_sched_avg()") Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 75 ++++++++++++++--------------------------------------- 1 file changed, 19 insertions(+), 56 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76f67b3e34d6..1e5f58081762 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -727,7 +727,6 @@ static unsigned long task_h_load(struct task_struct *p); */ #define LOAD_AVG_PERIOD 32 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -2743,26 +2742,6 @@ static const u32 runnable_avg_yN_inv[] = { 0x85aac367, 0x82cd8698, }; -/* - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent - * over-estimates when re-combining. - */ -static const u32 runnable_avg_yN_sum[] = { - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to - * lower integers. 
See Documentation/scheduler/sched-avg.txt how these - * were generated: - */ -static const u32 __accumulated_sum_N32[] = { - 0, 23371, 35056, 40899, 43820, 45281, - 46011, 46376, 46559, 46650, 46696, 46719, -}; - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -2771,9 +2750,7 @@ static u64 decay_load(u64 val, u64 n) { unsigned int local_n; - if (!n) - return val; - else if (unlikely(n > LOAD_AVG_PERIOD * 63)) + if (unlikely(n > LOAD_AVG_PERIOD * 63)) return 0; /* after bounds checking we can collapse to 32-bit */ @@ -2795,40 +2772,25 @@ static u64 decay_load(u64 val, u64 n) return val; } -static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder) +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) { - u32 c1, c2, c3 = remainder; /* y^0 == 1 */ - - if (!periods) - return remainder - period_contrib; - - if (unlikely(periods >= LOAD_AVG_MAX_N)) - return LOAD_AVG_MAX; + u32 c1, c2, c3 = d3; /* y^0 == 1 */ /* * c1 = d1 y^(p+1) */ - c1 = decay_load((u64)(1024 - period_contrib), periods); + c1 = decay_load((u64)d1, periods); - periods -= 1; /* - * For updates fully spanning n periods, the contribution to runnable - * average will be: + * p + * c2 = 1024 \Sum y^n + * n=1 * - * c2 = 1024 \Sum y^n - * - * We can compute this reasonably efficiently by combining: - * - * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD} + * inf inf + * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) + * n=0 n=p+1 */ - if (likely(periods <= LOAD_AVG_PERIOD)) { - c2 = runnable_avg_yN_sum[periods]; - } else { - c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD]; - periods %= LOAD_AVG_PERIOD; - c2 = decay_load(c2, periods); - c2 += runnable_avg_yN_sum[periods]; - } + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; return c1 + c2 + c3; } @@ -2861,8 +2823,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - u32 contrib; scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); @@ -2880,13 +2842,14 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, decay_load(cfs_rq->runnable_load_sum, periods); } sa->util_sum = decay_load((u64)(sa->util_sum), periods); - } - /* - * Step 2 - */ - delta %= 1024; - contrib = __accumulate_sum(periods, sa->period_contrib, delta); + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - sa->period_contrib, delta); + } sa->period_contrib = delta; contrib = cap_scale(contrib, scale_freq); -- cgit From 76d034edcf658e3c74fd90b149882ab1464e4af9 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 13 Feb 2017 05:44:22 +0800 Subject: sched/Documentation: Add 'sched-pelt' tool Add a user-space program to compute/generate the PELT constants. The kernel/sched/sched-pelt.h header will contain the output of this program. 
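Distilled from the program added below, the core of what it computes for a given table index n is roughly (a floating-point sketch assuming HALFLIFE = 32 and SHIFT = 32 as defined in the file; the tool's fixed-point iteration, not this shorthand, produces the final numbers):

	double y = pow(0.5, 1.0 / 32);			/* y^32 == 0.5, one half-life */

	/* runnable_avg_yN_inv[n]: y^n as a 2^32-scaled fixed-point multiplier */
	unsigned int inv_n = ((1UL << 32) - 1) * pow(y, n);

	/* LOAD_AVG_MAX: the fixed point of max = max * y + 1024, ~47742 */

The file's header comment notes it must be compiled with -lm; its output is what later lands in kernel/sched/sched-pelt.h.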
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: matt@codeblueprint.co.uk Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: umgwanakikbuti@gmail.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1486935863-25251-2-git-send-email-yuyang.du@intel.com Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-pelt.c | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 Documentation/scheduler/sched-pelt.c diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c new file mode 100644 index 000000000000..e4219139386a --- /dev/null +++ b/Documentation/scheduler/sched-pelt.c @@ -0,0 +1,108 @@ +/* + * The following program is used to generate the constants for + * computing sched averages. + * + * ============================================================== + * C program (compile with -lm) + * ============================================================== + */ + +#include +#include + +#define HALFLIFE 32 +#define SHIFT 32 + +double y; + +void calc_runnable_avg_yN_inv(void) +{ + int i; + unsigned int x; + + printf("static const u32 runnable_avg_yN_inv[] = {"); + for (i = 0; i < HALFLIFE; i++) { + x = ((1UL<<32)-1)*pow(y, i); + + if (i % 6 == 0) printf("\n\t"); + printf("0x%8x, ", x); + } + printf("\n};\n\n"); +} + +int sum = 1024; + +void calc_runnable_avg_yN_sum(void) +{ + int i; + + printf("static const u32 runnable_avg_yN_sum[] = {\n\t 0,"); + for (i = 1; i <= HALFLIFE; i++) { + if (i == 1) + sum *= y; + else + sum = sum*y + 1024*y; + + if (i % 11 == 0) + printf("\n\t"); + + printf("%5d,", sum); + } + printf("\n};\n\n"); +} + +int n = -1; +/* first period */ +long max = 1024; + +void calc_converged_max(void) +{ + long last = 0, y_inv = ((1UL<<32)-1)*y; + + for (; ; n++) { + if (n > -1) + max = ((max*y_inv)>>SHIFT) + 1024; + /* + * This is the same as: + * max = max*y + 1024; + */ + + if (last == max) + break; + + last = max; + } + n--; + printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE); + printf("#define LOAD_AVG_MAX %ld\n", max); +// printf("#define LOAD_AVG_MAX_N %d\n\n", n); +} + +void calc_accumulated_sum_32(void) +{ + int i, x = sum; + + printf("static const u32 __accumulated_sum_N32[] = {\n\t 0,"); + for (i = 1; i <= n/HALFLIFE+1; i++) { + if (i > 1) + x = x/2 + sum; + + if (i % 6 == 0) + printf("\n\t"); + + printf("%6d,", x); + } + printf("\n};\n\n"); +} + +void main(void) +{ + printf("/* Generated by Documentation/scheduler/sched-pelt; do not modify. */\n\n"); + + y = pow(0.5, 1/(double)HALFLIFE); + + calc_runnable_avg_yN_inv(); +// calc_runnable_avg_yN_sum(); + calc_converged_max(); +// calc_accumulated_sum_32(); +} -- cgit From 3841cdc31099fe3b84c93903c63e3d60348c0ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Apr 2017 12:47:33 +0200 Subject: sched/fair: Fix comments Historically our periods (or p) argument in PELT denoted the number of full periods (what is now d2). However recent patches have changed this to the total decay (previously p+1), leading to a confusing discrepancy between comments and code. Try and clarify things by making periods (in code) and p (in comments) be the same thing (again). 
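In other words, with p now meaning the number of period boundaries crossed (the decay applied to the old sum), the decomposition in the comments reads (the same identity as before, merely reindexed):

	u' = (u + d_1) \, y^{p} + 1024 \sum_{n=1}^{p-1} y^{n} + d_3 \, y^{0}

where the comments previously wrote y^(p+1) and summed up to p.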
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e5f58081762..d43e9ac9c3c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2777,18 +2777,18 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) u32 c1, c2, c3 = d3; /* y^0 == 1 */ /* - * c1 = d1 y^(p+1) + * c1 = d1 y^p */ c1 = decay_load((u64)d1, periods); /* - * p + * p-1 * c2 = 1024 \Sum y^n * n=1 * * inf inf * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) - * n=0 n=p+1 + * n=0 n=p */ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; @@ -2808,15 +2808,15 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) * |<->|<----------------->|<--->| * ... |---x---|------| ... |------|-----x (now) * - * p - * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0 - * n=1 + * p-1 + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 + * n=1 * - * = u y^(p+1) + (Step 1) + * = u y^p + (Step 1) * - * p - * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2) - * n=1 + * p-1 + * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 */ static __always_inline u32 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, -- cgit From bb0bd044e65c2bf0f26b29613fcc441dfdeedf14 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Apr 2017 13:20:45 +0200 Subject: sched/fair: Increase PELT accuracy for small tasks We truncate (and loose) the lower 10 bits of runtime in ___update_load_avg(), this means there's a consistent bias to under-account tasks. This is esp. significant for small tasks. Cure this by only forwarding last_update_time to the point we've actually accounted for, leaving the remainder for the next time. Reported-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d43e9ac9c3c5..1e3b99a9ab69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2915,7 +2915,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta >>= 10; if (!delta) return 0; - sa->last_update_time = now; + + sa->last_update_time += delta << 10; /* * Now we know we crossed measurement unit boundaries. The *_avg -- cgit From 283e2ed3990c36c00403b62b264ebfabaf931104 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Apr 2017 11:08:42 +0200 Subject: sched/fair: Move the PELT constants into a generated header Now that we have a tool to generate the PELT constants in C form, use its output as a separate header. 
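For context on what the generated numbers are used for: decay_load() treats each runnable_avg_yN_inv[] entry as a 2^32-scaled multiplier for y^n. A simplified sketch (decay_step() is a made-up name; the kernel uses a widening multiply helper rather than a plain 64-bit multiply to avoid overflow, plus extra folding for n >= LOAD_AVG_PERIOD):

	/* val * y^n for n < LOAD_AVG_PERIOD, with y^32 == 0.5 */
	static u64 decay_step(u64 val, unsigned int n)
	{
		return (val * runnable_avg_yN_inv[n]) >> 32;
	}

LOAD_AVG_PERIOD and LOAD_AVG_MAX ride along in the same generated header, so fair.c keeps a single source for all PELT constants.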
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 +++------------------ kernel/sched/sched-pelt.h | 13 +++++++++++++ 2 files changed, 16 insertions(+), 18 deletions(-) create mode 100644 kernel/sched/sched-pelt.h diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e3b99a9ab69..a903276fcb62 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -717,17 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP + +#include "sched-pelt.h" + static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are - * dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ - /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { @@ -2732,16 +2727,6 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ -static const u32 runnable_avg_yN_inv[] = { - 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, - 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, - 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, - 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, - 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, - 0x85aac367, 0x82cd8698, -}; - /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h new file mode 100644 index 000000000000..cd200d16529e --- /dev/null +++ b/kernel/sched/sched-pelt.h @@ -0,0 +1,13 @@ +/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ + +static const u32 runnable_avg_yN_inv[] = { + 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, + 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, + 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, + 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, + 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, + 0x85aac367, 0x82cd8698, +}; + +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 -- cgit From 048c9b954e20396e0c45ee778466994d1be2e612 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:27 +0200 Subject: ia64/topology: Remove cpus_allowed manipulation The CPU hotplug callback fiddles with the cpus_allowed pointer to pin the calling thread on the plugged CPU. That's already guaranteed by the hotplug core code. Remove it. Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. 
Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.174518069@linutronix.de Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/topology.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 1a68f012a6dc..d76529cbff20 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu) unsigned long i, j; struct cache_info *this_object; int retval = 0; - cpumask_t oldmask; if (all_cpu_cache_info[cpu].kobj.parent) return 0; - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, cpumask_of(cpu)); - if (unlikely(retval)) - return retval; retval = cpu_cache_sysfs_init(cpu); - set_cpus_allowed_ptr(current, &oldmask); if (unlikely(retval < 0)) return retval; -- cgit From 0e8d6a9336b487a1dd6f1991ff376e669d4c87c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:28 +0200 Subject: workqueue: Provide work_on_cpu_safe() work_on_cpu() is not protected against CPU hotplug. For code which requires to be either executed on an online CPU or to fail if the CPU is not available the callsite would have to protect against CPU hotplug. Provide a function which does get/put_online_cpus() around the call to work_on_cpu() and fails the call with -ENODEV if the target CPU is not online. Preparatory patch to convert several racy task affinity manipulations. Signed-off-by: Thomas Gleixner Acked-by: Tejun Heo Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: Michael Ellerman Cc: "David S. Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.262610721@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/workqueue.h | 5 +++++ kernel/workqueue.c | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bde063cefd04..c102ef65cb64 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } +static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + return fn(arg); +} #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0168b7da1ea..5bf1be018628 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4735,6 +4735,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); + +/** + * work_on_cpu_safe - run a function in thread context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function argument + * + * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold + * any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. 
+ */ +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + long ret = -ENODEV; + + get_online_cpus(); + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, fn, arg); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER -- cgit From 67cb85fdcee7fbc61c09c00360d1a4ae37641db4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:29 +0200 Subject: ia64/salinfo: Replace racy task affinity logic Some of the file operations in /proc/sal require to run code on the requested cpu. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by using work_on_cpu_safe() which guarantees to run the code on the requested CPU or to fail in case the CPU is offline. Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.341863457@linutronix.de Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/salinfo.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index d194d5c83d32..63dc9cdc95c5 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms { const u8 *efi_guid; u8 **oemdata; u64 *oemdata_size; - int ret; }; -static void +static long salinfo_platform_oemdata_cpu(void *context) { struct salinfo_platform_oemdata_parms *parms = context; - parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size); + + return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size); } static void @@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file) return 0; } -static void -call_on_cpu(int cpu, void (*fn)(void *), void *arg) -{ - cpumask_t save_cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - (*fn)(arg); - set_cpus_allowed_ptr(current, &save_cpus_allowed); -} - -static void +static long salinfo_log_read_cpu(void *context) { struct salinfo_data *data = context; @@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context) /* Clear corrected errors as they are read from SAL */ if (rh->severity == sal_log_severity_corrected) ia64_sal_clear_state_info(data->type); + return 0; } static void @@ -430,7 +422,7 @@ retry: spin_unlock_irqrestore(&data_saved_lock, flags); if (!data->saved_num) - call_on_cpu(cpu, salinfo_log_read_cpu, data); + work_on_cpu_safe(cpu, salinfo_log_read_cpu, data); if (!data->log_size) { data->state = STATE_NO_DATA; cpumask_clear_cpu(cpu, &data->cpu_event); @@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *p return simple_read_from_buffer(buffer, count, ppos, buf, bufsize); } -static void +static long salinfo_log_clear_cpu(void *context) { struct salinfo_data *data = context; + ia64_sal_clear_state_info(data->type); + return 0; } static int @@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu) 
rh = (sal_log_record_header_t *)(data->log_buffer); /* Corrected errors have already been cleared from SAL */ if (rh->severity != sal_log_severity_corrected) - call_on_cpu(cpu, salinfo_log_clear_cpu, data); + work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data); /* clearing a record may make a new record visible */ salinfo_log_new_read(cpu, data); if (data->state == STATE_LOG_RECORD) { @@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user *buffer, size_t count, lo .oemdata = &data->oemdata, .oemdata_size = &data->oemdata_size }; - call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms); - if (parms.ret) - count = parms.ret; + count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu, + &parms); } else data->oemdata_size = 0; } else -- cgit From 9feb42ac88b516e378b9782e82b651ca5bed95c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 6 Apr 2017 14:56:18 +0200 Subject: ia64/sn/hwperf: Replace racy task affinity logic sn_hwperf_op_cpu() which is invoked from an ioctl requires to run code on the requested cpu. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by using work_on_cpu_safe() which guarantees to run the code on the requested CPU or to fail in case the CPU is offline. Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704122251450.2548@nanos Signed-off-by: Thomas Gleixner --- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 52704f199dd6..55febd65911a 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info) op_info->ret = r; } +static long sn_hwperf_call_sal_work(void *info) +{ + sn_hwperf_call_sal(info); + return 0; +} + static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) { u32 cpu; u32 use_ipi; int r = 0; - cpumask_t save_allowed; cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32; use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK; @@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) /* use an interprocessor interrupt to call SAL */ smp_call_function_single(cpu, sn_hwperf_call_sal, op_info, 1); - } - else { - /* migrate the task before calling SAL */ - save_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - sn_hwperf_call_sal(op_info); - set_cpus_allowed_ptr(current, &save_allowed); + } else { + /* Call on the target CPU */ + work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info); } } r = op_info->ret; -- cgit From 6d11b87d55eb75007a3721c2de5938f5bbf607fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:31 +0200 Subject: powerpc/smp: Replace open coded task affinity logic Init task invokes smp_ops->setup_cpu() from smp_cpus_done(). 
Init task can run on any online CPU at this point, but the setup_cpu() callback requires to be invoked on the boot CPU. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. That's actually not a problem in this context as neither CPU hotplug nor affinity settings can happen, but the access to task_struct::cpus_allowed is about to restricted. Replace it with a call to work_on_cpu_safe() which achieves the same result. Signed-off-by: Thomas Gleixner Acked-by: Michael Ellerman Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: Tejun Heo Cc: Paul Mackerras Cc: linuxppc-dev@lists.ozlabs.org Cc: "David S. Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.518053336@linutronix.de Signed-off-by: Thomas Gleixner --- arch/powerpc/kernel/smp.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 46f89e66a273..d68ed1f004a3 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -787,24 +787,21 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; -void __init smp_cpus_done(unsigned int max_cpus) +static __init long smp_setup_cpu_workfn(void *data __always_unused) { - cpumask_var_t old_mask; + smp_ops->setup_cpu(boot_cpuid); + return 0; +} - /* We want the setup_cpu() here to be called from CPU 0, but our - * init thread may have been "borrowed" by another CPU in the meantime - * se we pin us down to CPU 0 for a short while +void __init smp_cpus_done(unsigned int max_cpus) +{ + /* + * We want the setup_cpu() here to be called on the boot CPU, but + * init might run on any CPU, so make sure it's invoked on the boot + * CPU. */ - alloc_cpumask_var(&old_mask, GFP_NOWAIT); - cpumask_copy(old_mask, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid)); - if (smp_ops && smp_ops->setup_cpu) - smp_ops->setup_cpu(boot_cpuid); - - set_cpus_allowed_ptr(current, old_mask); - - free_cpumask_var(old_mask); + work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL); if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); @@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus) dump_numa_cpu_topology(); set_sched_topology(powerpc_topology); - } #ifdef CONFIG_HOTPLUG_CPU -- cgit From ea875ec94eafb858990f3fe9528501f983105653 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Apr 2017 10:17:07 +0200 Subject: sparc/sysfs: Replace racy task affinity logic The mmustat_enable sysfs file accessor functions must run code on the target CPU. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by using work_on_cpu() which guarantees to run the code on the requested CPU. Protection against CPU hotplug is not required as the open sysfs file already prevents the removal from the CPU offline callback. 
Using the hotplug protected version would actually be wrong because it would deadlock against a CPU hotplug operation of the CPU associated to the sysfs file in progress. Signed-off-by: Thomas Gleixner Acked-by: David S. Miller Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: herbert@gondor.apana.org.au Cc: rjw@rjwysocki.net Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: jiangshanlai@gmail.com Cc: sparclinux@vger.kernel.org Cc: viresh.kumar@linaro.org Cc: mpe@ellerman.id.au Cc: tj@kernel.org Cc: lenb@kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704131001270.2408@nanos Signed-off-by: Thomas Gleixner --- arch/sparc/kernel/sysfs.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index d63fc613e7a9..5fd352b759af 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = { .name = "mmu_stats", }; -/* XXX convert to rusty's on_one_cpu */ -static unsigned long run_on_cpu(unsigned long cpu, - unsigned long (*func)(unsigned long), - unsigned long arg) -{ - cpumask_t old_affinity; - unsigned long ret; - - cpumask_copy(&old_affinity, ¤t->cpus_allowed); - /* should return -EINVAL to userspace */ - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) - return 0; - - ret = func(arg); - - set_cpus_allowed_ptr(current, &old_affinity); - - return ret; -} - -static unsigned long read_mmustat_enable(unsigned long junk) +static long read_mmustat_enable(void *data __maybe_unused) { unsigned long ra = 0; @@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long junk) return ra != 0; } -static unsigned long write_mmustat_enable(unsigned long val) +static long write_mmustat_enable(void *data) { - unsigned long ra, orig_ra; + unsigned long ra, orig_ra, *val = data; - if (val) + if (*val) ra = __pa(&per_cpu(mmu_stats, smp_processor_id())); else ra = 0UL; @@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val) static ssize_t show_mmustat_enable(struct device *s, struct device_attribute *attr, char *buf) { - unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0); + long val = work_on_cpu(s->id, read_mmustat_enable, NULL); + return sprintf(buf, "%lx\n", val); } @@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long val, err; - int ret = sscanf(buf, "%lu", &val); + unsigned long val; + long err; + int ret; + ret = sscanf(buf, "%lu", &val); if (ret != 1) return -EINVAL; - err = run_on_cpu(s->id, write_mmustat_enable, val); + err = work_on_cpu(s->id, write_mmustat_enable, &val); if (err) return -EIO; -- cgit From a5cbdf693a60d5b86d4d21dfedd90f17754eb273 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:33 +0200 Subject: ACPI/processor: Fix error handling in __acpi_processor_start() When acpi_install_notify_handler() fails the cooling device stays registered and the sysfs files created via acpi_pss_perf_init() are leaked and the function returns success. Undo acpi_pss_perf_init() and return a proper error code. Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: linux-acpi@vger.kernel.org Cc: Viresh Kumar Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. 
Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.695499645@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/acpi/processor_driver.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 9d5f0c7ed3f7..eab8cdad7dc3 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device *device) if (ACPI_SUCCESS(status)) return 0; + result = -ENODEV; + acpi_pss_perf_exit(pr, device); + err_power_exit: acpi_processor_power_exit(pr); return result; -- cgit From 8153f9ac43897f9f4786b30badc134fcc1a4fb11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:34 +0200 Subject: ACPI/processor: Replace racy task affinity logic acpi_processor_get_throttling() requires to invoke the getter function on the target CPU. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. acpi_processor_get_throttling() is invoked in two ways: 1) The CPU online callback, which is already running on the target CPU and obviously protected against hotplug and not affected by affinity settings. 2) The ACPI driver probe function, which is not protected against hotplug during modprobe. Switch it over to work_on_cpu() and protect the probe function against CPU hotplug. Signed-off-by: Thomas Gleixner Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: linux-acpi@vger.kernel.org Cc: Viresh Kumar Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. 
Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.785920903@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/acpi/processor_driver.c | 7 ++++- drivers/acpi/processor_throttling.c | 62 +++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index eab8cdad7dc3..8697a82bd465 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -262,11 +262,16 @@ err_power_exit: static int acpi_processor_start(struct device *dev) { struct acpi_device *device = ACPI_COMPANION(dev); + int ret; if (!device) return -ENODEV; - return __acpi_processor_start(device); + /* Protect against concurrent CPU hotplug operations */ + get_online_cpus(); + ret = __acpi_processor_start(device); + put_online_cpus(); + return ret; } static int acpi_processor_stop(struct device *dev) diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index a12f96cc93ff..3de34633f7f9 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg { #define THROTTLING_POSTCHANGE (2) static int acpi_processor_get_throttling(struct acpi_processor *pr); -int acpi_processor_set_throttling(struct acpi_processor *pr, - int state, bool force); +static int __acpi_processor_set_throttling(struct acpi_processor *pr, + int state, bool force, bool direct); static int acpi_processor_update_tsd_coord(void) { @@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Invalid throttling state, reset\n")); state = 0; - ret = acpi_processor_set_throttling(pr, state, true); + ret = __acpi_processor_set_throttling(pr, state, true, + true); if (ret) return ret; } @@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) return 0; } -static int acpi_processor_get_throttling(struct acpi_processor *pr) +static long __acpi_processor_get_throttling(void *data) { - cpumask_var_t saved_mask; - int ret; + struct acpi_processor *pr = data; + + return pr->throttling.acpi_processor_get_throttling(pr); +} +static int acpi_processor_get_throttling(struct acpi_processor *pr) +{ if (!pr) return -EINVAL; if (!pr->flags.throttling) return -ENODEV; - if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL)) - return -ENOMEM; - /* - * Migrate task to the cpu pointed by pr. + * This is either called from the CPU hotplug callback of + * processor_driver or via the ACPI probe function. In the latter + * case the CPU is not guaranteed to be online. Both call sites are + * protected against CPU hotplug. */ - cpumask_copy(saved_mask, ¤t->cpus_allowed); - /* FIXME: use work_on_cpu() */ - if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) { - /* Can't migrate to the target pr->id CPU. 
Exit */ - free_cpumask_var(saved_mask); + if (!cpu_online(pr->id)) return -ENODEV; - } - ret = pr->throttling.acpi_processor_get_throttling(pr); - /* restore the previous state */ - set_cpus_allowed_ptr(current, saved_mask); - free_cpumask_var(saved_mask); - return ret; + return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr); } static int acpi_processor_get_fadt_info(struct acpi_processor *pr) @@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data) arg->target_state, arg->force); } -int acpi_processor_set_throttling(struct acpi_processor *pr, - int state, bool force) +static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct) +{ + if (direct) + return fn(arg); + return work_on_cpu(cpu, fn, arg); +} + +static int __acpi_processor_set_throttling(struct acpi_processor *pr, + int state, bool force, bool direct) { int ret = 0; unsigned int i; @@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, arg.pr = pr; arg.target_state = state; arg.force = force; - ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg); + ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg, + direct); } else { /* * When the T-state coordination is SW_ALL or HW_ALL, @@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, arg.pr = match_pr; arg.target_state = state; arg.force = force; - ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, - &arg); + ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, + &arg, direct); } } /* @@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor *pr, return ret; } +int acpi_processor_set_throttling(struct acpi_processor *pr, int state, + bool force) +{ + return __acpi_processor_set_throttling(pr, state, force, false); +} + int acpi_processor_get_throttling_info(struct acpi_processor *pr) { int result = 0; -- cgit From 38f05ed04beb276f780fcd2b5c0b78c76d0b3c0c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:55:03 +0200 Subject: cpufreq/ia64: Replace racy task affinity logic The get() and target() callbacks must run on the affected cpu. This is achieved by temporarily setting the affinity of the calling thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by work_on_cpu(). All call pathes which invoke the callbacks are already protected against CPU hotplug. Signed-off-by: Thomas Gleixner Acked-by: Viresh Kumar Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: linux-pm@vger.kernel.org Cc: Lai Jiangshan Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. 
Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704122231100.2548@nanos Signed-off-by: Thomas Gleixner --- drivers/cpufreq/ia64-acpi-cpufreq.c | 92 ++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 53 deletions(-) diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c index e28a31a40829..a757c0a1e7b5 100644 --- a/drivers/cpufreq/ia64-acpi-cpufreq.c +++ b/drivers/cpufreq/ia64-acpi-cpufreq.c @@ -34,6 +34,11 @@ struct cpufreq_acpi_io { unsigned int resume; }; +struct cpufreq_acpi_req { + unsigned int cpu; + unsigned int state; +}; + static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS]; static struct cpufreq_driver acpi_cpufreq_driver; @@ -83,8 +88,7 @@ processor_get_pstate ( static unsigned extract_clock ( struct cpufreq_acpi_io *data, - unsigned value, - unsigned int cpu) + unsigned value) { unsigned long i; @@ -98,60 +102,43 @@ extract_clock ( } -static unsigned int +static long processor_get_freq ( - struct cpufreq_acpi_io *data, - unsigned int cpu) + void *arg) { - int ret = 0; - u32 value = 0; - cpumask_t saved_mask; - unsigned long clock_freq; + struct cpufreq_acpi_req *req = arg; + unsigned int cpu = req->cpu; + struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + u32 value; + int ret; pr_debug("processor_get_freq\n"); - - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); if (smp_processor_id() != cpu) - goto migrate_end; + return -EAGAIN; /* processor_get_pstate gets the instantaneous frequency */ ret = processor_get_pstate(&value); - if (ret) { - set_cpus_allowed_ptr(current, &saved_mask); pr_warn("get performance failed with error %d\n", ret); - ret = 0; - goto migrate_end; + return ret; } - clock_freq = extract_clock(data, value, cpu); - ret = (clock_freq*1000); - -migrate_end: - set_cpus_allowed_ptr(current, &saved_mask); - return ret; + return 1000 * extract_clock(data, value); } -static int +static long processor_set_freq ( - struct cpufreq_acpi_io *data, - struct cpufreq_policy *policy, - int state) + void *arg) { - int ret = 0; - u32 value = 0; - cpumask_t saved_mask; - int retval; + struct cpufreq_acpi_req *req = arg; + unsigned int cpu = req->cpu; + struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + int ret, state = req->state; + u32 value; pr_debug("processor_set_freq\n"); - - saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(policy->cpu)); - if (smp_processor_id() != policy->cpu) { - retval = -EAGAIN; - goto migrate_end; - } + if (smp_processor_id() != cpu) + return -EAGAIN; if (state == data->acpi_data.state) { if (unlikely(data->resume)) { @@ -159,8 +146,7 @@ processor_set_freq ( data->resume = 0; } else { pr_debug("Already at target state (P%d)\n", state); - retval = 0; - goto migrate_end; + return 0; } } @@ -171,7 +157,6 @@ processor_set_freq ( * First we write the target state's 'control' value to the * control_register. 
*/ - value = (u32) data->acpi_data.states[state].control; pr_debug("Transitioning to state: 0x%08x\n", value); @@ -179,17 +164,11 @@ processor_set_freq ( ret = processor_set_pstate(value); if (ret) { pr_warn("Transition failed with error %d\n", ret); - retval = -ENODEV; - goto migrate_end; + return -ENODEV; } data->acpi_data.state = state; - - retval = 0; - -migrate_end: - set_cpus_allowed_ptr(current, &saved_mask); - return (retval); + return 0; } @@ -197,11 +176,13 @@ static unsigned int acpi_cpufreq_get ( unsigned int cpu) { - struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + struct cpufreq_acpi_req req; + long ret; - pr_debug("acpi_cpufreq_get\n"); + req.cpu = cpu; + ret = work_on_cpu(cpu, processor_get_freq, &req); - return processor_get_freq(data, cpu); + return ret > 0 ? (unsigned int) ret : 0; } @@ -210,7 +191,12 @@ acpi_cpufreq_target ( struct cpufreq_policy *policy, unsigned int index) { - return processor_set_freq(acpi_io_data[policy->cpu], policy, index); + struct cpufreq_acpi_req req; + + req.cpu = policy->cpu; + req.state = index; + + return work_on_cpu(req.cpu, processor_set_freq, &req); } static int -- cgit From 205dcc1ecbc566cbc20acf246e68de3b080b3ecf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:36 +0200 Subject: cpufreq/sh: Replace racy task affinity logic The target() callback must run on the affected cpu. This is achieved by temporarily setting the affinity of the calling thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. concurrent affinity settings for that thread resulting in code executing on the wrong CPU. Replace it by work_on_cpu(). All call pathes which invoke the callbacks are already protected against CPU hotplug. Signed-off-by: Thomas Gleixner Acked-by: Viresh Kumar Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: linux-pm@vger.kernel.org Cc: Lai Jiangshan Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.958216363@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/cpufreq/sh-cpufreq.c | 45 ++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c index 86628e22b2a3..719c3d9f07fb 100644 --- a/drivers/cpufreq/sh-cpufreq.c +++ b/drivers/cpufreq/sh-cpufreq.c @@ -30,54 +30,63 @@ static DEFINE_PER_CPU(struct clk, sh_cpuclk); +struct cpufreq_target { + struct cpufreq_policy *policy; + unsigned int freq; +}; + static unsigned int sh_cpufreq_get(unsigned int cpu) { return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000; } -/* - * Here we notify other drivers of the proposed change and the final change. 
- */ -static int sh_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) +static long __sh_cpufreq_target(void *arg) { - unsigned int cpu = policy->cpu; + struct cpufreq_target *target = arg; + struct cpufreq_policy *policy = target->policy; + int cpu = policy->cpu; struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu); - cpumask_t cpus_allowed; struct cpufreq_freqs freqs; struct device *dev; long freq; - cpus_allowed = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - - BUG_ON(smp_processor_id() != cpu); + if (smp_processor_id() != cpu) + return -ENODEV; dev = get_cpu_device(cpu); /* Convert target_freq from kHz to Hz */ - freq = clk_round_rate(cpuclk, target_freq * 1000); + freq = clk_round_rate(cpuclk, target->freq * 1000); if (freq < (policy->min * 1000) || freq > (policy->max * 1000)) return -EINVAL; - dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000); + dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000); freqs.old = sh_cpufreq_get(cpu); freqs.new = (freq + 500) / 1000; freqs.flags = 0; - cpufreq_freq_transition_begin(policy, &freqs); - set_cpus_allowed_ptr(current, &cpus_allowed); + cpufreq_freq_transition_begin(target->policy, &freqs); clk_set_rate(cpuclk, freq); - cpufreq_freq_transition_end(policy, &freqs, 0); + cpufreq_freq_transition_end(target->policy, &freqs, 0); dev_dbg(dev, "set frequency %lu Hz\n", freq); - return 0; } +/* + * Here we notify other drivers of the proposed change and the final change. + */ +static int sh_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpufreq_target data = { .policy = policy, .freq = target_freq }; + + return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data); +} + static int sh_cpufreq_verify(struct cpufreq_policy *policy) { struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu); -- cgit From 9fe24c4e92d3963d92d7d383e28ed098bd5689d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:37 +0200 Subject: cpufreq/sparc-us3: Replace racy task affinity logic The access to the safari config register in the CPU frequency functions must be executed on the target CPU. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and reset it to the original affinity afterwards. That's racy vs. CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by a straight forward smp function call. Signed-off-by: Thomas Gleixner Acked-by: Viresh Kumar Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: linux-pm@vger.kernel.org Cc: Lai Jiangshan Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. 
Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201043.047558840@linutronix.de Signed-off-by: Thomas Gleixner --- drivers/cpufreq/sparc-us3-cpufreq.c | 46 ++++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c b/drivers/cpufreq/sparc-us3-cpufreq.c index a8d86a449ca1..30645b0118f9 100644 --- a/drivers/cpufreq/sparc-us3-cpufreq.c +++ b/drivers/cpufreq/sparc-us3-cpufreq.c @@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table; #define SAFARI_CFG_DIV_32 0x0000000080000000UL #define SAFARI_CFG_DIV_MASK 0x00000000C0000000UL -static unsigned long read_safari_cfg(void) +static void read_safari_cfg(void *arg) { - unsigned long ret; + unsigned long ret, *val = arg; __asm__ __volatile__("ldxa [%%g0] %1, %0" : "=&r" (ret) : "i" (ASI_SAFARI_CONFIG)); - return ret; + *val = ret; } -static void write_safari_cfg(unsigned long val) +static void update_safari_cfg(void *arg) { + unsigned long reg, *new_bits = arg; + + read_safari_cfg(&reg); + reg &= ~SAFARI_CFG_DIV_MASK; + reg |= *new_bits; + __asm__ __volatile__("stxa %0, [%%g0] %1\n\t" "membar #Sync" : /* no outputs */ - : "r" (val), "i" (ASI_SAFARI_CONFIG) + : "r" (reg), "i" (ASI_SAFARI_CONFIG) : "memory"); } @@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, unsigned long safari_cfg static unsigned int us3_freq_get(unsigned int cpu) { - cpumask_t cpus_allowed; unsigned long reg; - unsigned int ret; - - cpumask_copy(&cpus_allowed, &current->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - - reg = read_safari_cfg(); - ret = get_current_freq(cpu, reg); - - set_cpus_allowed_ptr(current, &cpus_allowed); - return ret; + if (smp_call_function_single(cpu, read_safari_cfg, &reg, 1)) + return 0; + return get_current_freq(cpu, reg); } static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) { unsigned int cpu = policy->cpu; - unsigned long new_bits, new_freq, reg; - cpumask_t cpus_allowed; - - cpumask_copy(&cpus_allowed, &current->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); + unsigned long new_bits, new_freq; new_freq = sparc64_get_clock_tick(cpu) / 1000; switch (index) { @@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) BUG(); } - reg = read_safari_cfg(); - - reg &= ~SAFARI_CFG_DIV_MASK; - reg |= new_bits; - write_safari_cfg(reg); - - set_cpus_allowed_ptr(current, &cpus_allowed); - - return 0; + return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1); } static int __init us3_freq_cpu_init(struct cpufreq_policy *policy) -- cgit From 12699ac53a2e5fbd1fd7c164b11685d55c8aa28b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Apr 2017 10:22:43 +0200 Subject: cpufreq/sparc-us2e: Replace racy task affinity logic The access to the HBIRD_ESTAR_MODE register in the CPU frequency control functions must happen on the target CPU. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and resetting it to the original affinity afterwards. That's racy vs. CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by a straightforward smp function call. Signed-off-by: Thomas Gleixner Acked-by: Viresh Kumar Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. 
Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: linux-pm@vger.kernel.org Cc: Lai Jiangshan Cc: Michael Ellerman Cc: Tejun Heo Cc: "David S. Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704131020280.2408@nanos Signed-off-by: Thomas Gleixner --- drivers/cpufreq/sparc-us2e-cpufreq.c | 45 +++++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c b/drivers/cpufreq/sparc-us2e-cpufreq.c index 35ddb6da93aa..90f33efee5fc 100644 --- a/drivers/cpufreq/sparc-us2e-cpufreq.c +++ b/drivers/cpufreq/sparc-us2e-cpufreq.c @@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits, unsigned long clock_tick, unsigned long old_divisor, unsigned long divisor) { - unsigned long flags; - - local_irq_save(flags); - estar &= ~ESTAR_MODE_DIV_MASK; /* This is based upon the state transition diagram in the IIe manual. */ @@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned long new_bits, } else { BUG(); } - - local_irq_restore(flags); } static unsigned long index_to_estar_mode(unsigned int index) @@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar) return ret; } +static void __us2e_freq_get(void *arg) +{ + unsigned long *estar = arg; + + *estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); +} + static unsigned int us2e_freq_get(unsigned int cpu) { - cpumask_t cpus_allowed; unsigned long clock_tick, estar; - cpumask_copy(&cpus_allowed, &current->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - clock_tick = sparc64_get_clock_tick(cpu) / 1000; - estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); - - set_cpus_allowed_ptr(current, &cpus_allowed); + if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1)) + return 0; return clock_tick / estar_to_divisor(estar); } -static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) +static void __us2e_freq_target(void *arg) { - unsigned int cpu = policy->cpu; + unsigned int cpu = smp_processor_id(); + unsigned int *index = arg; unsigned long new_bits, new_freq; unsigned long clock_tick, divisor, old_divisor, estar; - cpumask_t cpus_allowed; - - cpumask_copy(&cpus_allowed, &current->cpus_allowed); - set_cpus_allowed_ptr(current, cpumask_of(cpu)); new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000; - new_bits = index_to_estar_mode(index); - divisor = index_to_divisor(index); + new_bits = index_to_estar_mode(*index); + divisor = index_to_divisor(*index); new_freq /= divisor; estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR); old_divisor = estar_to_divisor(estar); - if (old_divisor != divisor) + if (old_divisor != divisor) { us2e_transition(estar, new_bits, clock_tick * 1000, old_divisor, divisor); + } +} - set_cpus_allowed_ptr(current, &cpus_allowed); +static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) +{ + unsigned int cpu = policy->cpu; - return 0; + return smp_call_function_single(cpu, __us2e_freq_target, &index, 1); } static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy) -- cgit From 73810a069120aa831debb4d967310ab900f628ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Apr 2017 10:20:23 +0200 Subject: crypto: N2 - Replace racy task affinity logic spu_queue_register() needs to invoke setup functions on a particular CPU. This is achieved by temporarily setting the affinity of the calling user space thread to the requested CPU and resetting it to the original affinity afterwards. That's racy vs. 
CPU hotplug and concurrent affinity settings for that thread resulting in code executing on the wrong CPU and overwriting the new affinity setting. Replace it by using work_on_cpu_safe() which guarantees to run the code on the requested CPU or to fail in case the CPU is offline. Signed-off-by: Thomas Gleixner Acked-by: Herbert Xu Acked-by: "David S. Miller" Cc: Fenghua Yu Cc: Tony Luck Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: linux-crypto@vger.kernel.org Cc: Michael Ellerman Cc: Tejun Heo Cc: Len Brown Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704131019420.2408@nanos Signed-off-by: Thomas Gleixner --- drivers/crypto/n2_core.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index c5aac25a5738..4ecb77aa60e1 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -65,6 +65,11 @@ struct spu_queue { struct list_head list; }; +struct spu_qreg { + struct spu_queue *queue; + unsigned long type; +}; + static struct spu_queue **cpu_to_cwq; static struct spu_queue **cpu_to_mau; @@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void) kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]); } -static int spu_queue_register(struct spu_queue *p, unsigned long q_type) +static long spu_queue_register_workfn(void *arg) { - cpumask_var_t old_allowed; + struct spu_qreg *qr = arg; + struct spu_queue *p = qr->queue; + unsigned long q_type = qr->type; unsigned long hv_ret; - if (cpumask_empty(&p->sharing)) - return -EINVAL; - - if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(old_allowed, &current->cpus_allowed); - - set_cpus_allowed_ptr(current, &p->sharing); - hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q), CWQ_NUM_ENTRIES, &p->qhandle); if (!hv_ret) sun4v_ncs_sethead_marker(p->qhandle, 0); - set_cpus_allowed_ptr(current, old_allowed); + return hv_ret ? -EINVAL : 0; +} - free_cpumask_var(old_allowed); +static int spu_queue_register(struct spu_queue *p, unsigned long q_type) +{ + int cpu = cpumask_any_and(&p->sharing, cpu_online_mask); + struct spu_qreg qr = { .queue = p, .type = q_type }; - return (hv_ret ? -EINVAL : 0); + return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr); } static int spu_queue_setup(struct spu_queue *p) -- cgit From 21173d0b4d2a0b9e9e5f3155cf2cfc5781a6f4b1 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 18 Apr 2017 08:25:05 -0400 Subject: sched/x86: Update reschedule warning text Modify the reschedule warning to output the offline CPU number and use a better debug message. Signed-off-by: Prarit Bhargava Cc: Andrew Morton Cc: Daniel Bristot de Oliveira Cc: Hidehiro Kawai Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1492518305-3808-1-git-send-email-prarit@redhat.com [ Tweaked the warning message. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d3c66a15bbde..3cab8415389a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -124,7 +124,7 @@ static bool smp_no_nmi_ipi = false; static void native_smp_send_reschedule(int cpu) { if (unlikely(cpu_is_offline(cpu))) { - WARN_ON(1); + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); return; } apic->send_IPI(cpu, RESCHEDULE_VECTOR); -- cgit
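
The conversions above all follow the same shape, condensed in the sketch below. The names in it (my_read_hw_reg(), my_freq_get_racy(), my_freq_get() and the work function) are hypothetical and appear in none of the patches; the sketch only illustrates, under that assumption, the transformation being applied: instead of migrating the calling thread with set_cpus_allowed_ptr() and touching the hardware from there, the per-CPU work is packed into a callback that runs on the target CPU via work_on_cpu(). The sparc patches use smp_call_function_single() and the N2 patch uses work_on_cpu_safe() for the same purpose, but the structure is identical.

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/workqueue.h>

/* Illustrative only: stand-in for a hardware access that is valid solely on the target CPU. */
static unsigned long my_read_hw_reg(void)
{
	return 0;
}

/*
 * Old, racy shape: migrate the calling thread to the target CPU, do the
 * access, migrate back.  A concurrent sched_setaffinity() or CPU hotplug
 * operation can leave the access running on the wrong CPU and clobber
 * the thread's new affinity mask.
 */
static unsigned long my_freq_get_racy(unsigned int cpu)
{
	cpumask_t cpus_allowed;
	unsigned long val;

	cpumask_copy(&cpus_allowed, &current->cpus_allowed);
	set_cpus_allowed_ptr(current, cpumask_of(cpu));
	val = my_read_hw_reg();
	set_cpus_allowed_ptr(current, &cpus_allowed);
	return val;
}

/* New shape: run a callback synchronously on the target CPU instead. */
static long my_freq_get_workfn(void *arg)
{
	unsigned long *val = arg;

	*val = my_read_hw_reg();
	return 0;
}

static unsigned long my_freq_get(unsigned int cpu)
{
	unsigned long val = 0;

	/* work_on_cpu() returns whatever the work function returned. */
	if (work_on_cpu(cpu, my_freq_get_workfn, &val) < 0)
		return 0;
	return val;
}

Since work_on_cpu() simply hands back the callback's return value, error handling stays with the caller; work_on_cpu_safe(), as the N2 changelog notes, additionally guarantees that the callback either runs on the requested CPU or fails if that CPU is offline.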