summaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c1209
1 files changed, 952 insertions, 257 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e453492cff..ff74fca39ed2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,7 +97,7 @@ int sysctl_sched_rt_runtime = 950000;
*
* Normal scheduling state is serialized by rq->lock. __schedule() takes the
* local CPU's rq->lock, it optionally removes the task from the runqueue and
- * always looks at the local rq data structures to find the most elegible task
+ * always looks at the local rq data structures to find the most eligible task
* to run next.
*
* Task enqueue is also under rq->lock, possibly taken from another CPU.
@@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
-static inline void
-rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
-{
- csd->flags = 0;
- csd->func = func;
- csd->info = rq;
-}
-
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
@@ -518,7 +510,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
/*
* Atomically grab the task, if ->wake_q is !nil already it means
- * its already queued (either by us or someone else) and will get the
+ * it's already queued (either by us or someone else) and will get the
* wakeup due to that.
*
* In order to ensure that a pending wakeup will observe our pending
@@ -769,7 +761,7 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * If there are more than one RR tasks, we need the tick to effect the
+ * If there are more than one RR tasks, we need the tick to affect the
* actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
@@ -1187,14 +1179,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
* accounting was performed at enqueue time and we can just return
* here.
*
- * Need to be careful of the following enqeueue/dequeue ordering
+ * Need to be careful of the following enqueue/dequeue ordering
* problem too
*
* enqueue(taskA)
* // sched_uclamp_used gets enabled
* enqueue(taskB)
* dequeue(taskA)
- * // Must not decrement bukcet->tasks here
+ * // Must not decrement bucket->tasks here
* dequeue(taskB)
*
* where we could end up with stale data in uc_se and
@@ -1413,17 +1405,24 @@ done:
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
- lower_bound = attr->sched_util_min;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
- upper_bound = attr->sched_util_max;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
- if (lower_bound > upper_bound)
- return -EINVAL;
- if (upper_bound > SCHED_CAPACITY_SCALE)
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;
/*
@@ -1438,20 +1437,41 @@ static int uclamp_validate(struct task_struct *p,
return 0;
}
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
- /*
- * On scheduling class change, reset to default clamps for tasks
- * without a task-specific value.
- */
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int value;
- /* Keep using defined clamps across class changes */
- if (uc_se->user_defined)
+ if (!uclamp_reset(attr, clamp_id, uc_se))
continue;
/*
@@ -1459,21 +1479,25 @@ static void __setscheduler_uclamp(struct task_struct *p,
* at runtime.
*/
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- __uclamp_update_util_min_rt_default(p);
+ value = sysctl_sched_uclamp_util_min_rt_default;
else
- uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
return;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
attr->sched_util_min, true);
}
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
attr->sched_util_max, true);
}
@@ -1696,19 +1720,104 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+ if (likely(!p->migration_disabled))
+ return;
+
+ if (p->cpus_ptr != &p->cpus_mask)
+ return;
+
+ /*
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
+ */
+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+ p->migration_disabled++;
+ return;
+ }
+
+ preempt_disable();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ preempt_disable();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
+ /* When not in the task's cpumask, no point in looking further. */
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
- if (is_per_cpu_kthread(p))
+ /* migrate_disabled() must be allowed to finish. */
+ if (is_migration_disabled(p))
return cpu_online(cpu);
- return cpu_active(cpu);
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
+ return cpu_online(cpu);
+
+ /* Regular kernel threads don't get to stay during offline. */
+ if (cpu_rq(cpu)->balance_push)
+ return false;
+
+ /* But are allowed during online. */
+ return cpu_online(cpu);
}
/*
@@ -1750,8 +1859,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
}
struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+struct set_affinity_pending {
+ refcount_t refs;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};
/*
@@ -1783,16 +1900,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
+ struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
+ int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1802,21 +1922,137 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ /* migrate_enable() -- we must not race against SCA */
+ if (dest_cpu < 0) {
+ /*
+ * When this was migrate_enable() but we no longer
+ * have a @pending, a concurrent SCA 'fixed' things
+ * and we should be valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ goto out;
+ }
+
+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+ }
+
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
else
- p->wake_cpu = arg->dest_cpu;
+ p->wake_cpu = dest_cpu;
+
+ } else if (dest_cpu < 0 || pending) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
+ /*
+ * When this was migrate_enable() but we no longer have an
+ * @pending, a concurrent SCA 'fixed' things and we should be
+ * valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
+
+ /* For pending->{arg,stop_work} */
+ pending = arg->pending;
+ if (pending && refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
- local_irq_enable();
+ return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, lowest_rq->cpu);
+ activate_task(lowest_rq, p, 0);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
return 0;
}
@@ -1824,18 +2060,39 @@ static int migration_cpu_stop(void *data)
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = new_mask;
+ return;
+ }
+
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
struct rq *rq = task_rq(p);
bool queued, running;
- lockdep_assert_held(&p->pi_lock);
+ /*
+ * This here violates the locking rules for affinity, since we're only
+ * supposed to change these variables while holding both rq->lock and
+ * p->pi_lock.
+ *
+ * HOWEVER, it magically works, because ttwu() is the only code that
+ * accesses these variables under p->pi_lock and only does so after
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+ * before finish_task().
+ *
+ * XXX do further audits, this smells like something putrid.
+ */
+ if (flags & SCA_MIGRATE_DISABLE)
+ SCHED_WARN_ON(!p->on_cpu);
+ else
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
@@ -1851,7 +2108,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (running)
put_prev_task(rq, p);
- p->sched_class->set_cpus_allowed(p, new_mask);
+ p->sched_class->set_cpus_allowed(p, new_mask, flags);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1859,6 +2116,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
set_next_task(rq, p);
}
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ __do_set_cpus_allowed(p, new_mask, 0);
+}
+
+/*
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ * P0@CPU0 P1
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * <resumes>
+ * migrate_enable();
+ * __set_cpus_allowed_ptr();
+ * <wakes local stopper>
+ * `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * setup an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ * Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ * P0@CPU0 P1 P2
+ *
+ * migrate_disable();
+ * <preempted>
+ * set_cpus_allowed_ptr(P0, [1]);
+ * <blocks>
+ * set_cpus_allowed_ptr(P0, [0, 1]);
+ * <signal completion>
+ * <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ struct migration_arg arg = {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ };
+ bool complete = false;
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ struct task_struct *push_task = NULL;
+
+ if ((flags & SCA_MIGRATE_ENABLE) &&
+ (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+ rq->push_busy = true;
+ push_task = get_task_struct(p);
+ }
+
+ pending = p->migration_pending;
+ if (pending) {
+ refcount_inc(&pending->refs);
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (push_task) {
+ stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+ p, &rq->push_work);
+ }
+
+ if (complete)
+ goto do_complete;
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ /* Install the request */
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending)) {
+ task_rq_unlock(rq, p, rf);
+ return -EINVAL;
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE) {
+
+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+ p->migration_flags &= ~MDF_PUSH;
+ task_rq_unlock(rq, p, rf);
+
+ pending->arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1,
+ .pending = pending,
+ };
+
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+
+ return 0;
+ }
+
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ /*
+ * Lessen races (and headaches) by delegating
+ * is_migration_disabled(p) checks to the stopper, which will
+ * run on the same CPU as said p.
+ */
+ task_rq_unlock(rq, p, rf);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+do_complete:
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
+
+ /*
+ * Block the original owner of &pending until all subsequent callers
+ * have seen the completion and decremented the refcount
+ */
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ return 0;
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1869,7 +2328,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+ const struct cpumask *new_mask,
+ u32 flags)
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
unsigned int dest_cpu;
@@ -1880,9 +2340,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD) {
+ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
/*
- * Kernel threads are allowed on online && !active CPUs
+ * Kernel threads are allowed on online && !active CPUs,
+ * however, during cpu-hot-unplug, even these might get pushed
+ * away if not KTHREAD_IS_PER_CPU.
+ *
+ * Specifically, migration_disabled() tasks must not fail the
+ * cpumask_any_and_distribute() pick below, esp. so on
+ * SCA_MIGRATE_ENABLE, otherwise we'll not call
+ * set_cpus_allowed_common() and actually reset p->cpus_ptr.
*/
cpu_valid_mask = cpu_online_mask;
}
@@ -1891,13 +2358,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
- if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
- if (cpumask_equal(&p->cpus_mask, new_mask))
- goto out;
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ if (cpumask_equal(&p->cpus_mask, new_mask))
+ goto out;
+
+ if (WARN_ON_ONCE(p == current &&
+ is_migration_disabled(p) &&
+ !cpumask_test_cpu(task_cpu(p), new_mask))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
/*
* Picking a ~random cpu helps in cases where we are changing affinity
@@ -1910,35 +2386,10 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- do_set_cpus_allowed(p, new_mask);
-
- if (p->flags & PF_KTHREAD) {
- /*
- * For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-CPU threads.
- */
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- !cpumask_intersects(new_mask, cpu_active_mask) &&
- p->nr_cpus_allowed != 1);
- }
+ __do_set_cpus_allowed(p, new_mask, flags);
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
+ return affine_move_task(rq, p, &rf, dest_cpu, flags);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- return 0;
- } else if (task_on_rq_queued(p)) {
- /*
- * OK, since we're going to drop the lock immediately
- * afterwards anyway.
- */
- rq = move_queued_task(rq, &rf, p, dest_cpu);
- }
out:
task_rq_unlock(rq, p, &rf);
@@ -1947,7 +2398,7 @@ out:
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- return __set_cpus_allowed_ptr(p, new_mask, false);
+ return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
@@ -1988,6 +2439,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
*/
WARN_ON_ONCE(!cpu_online(new_cpu));
+
+ WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -2318,6 +2771,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
+ /*
+ * XXX When called from select_task_rq() we only
+ * hold p->pi_lock and again violate locking order.
+ *
+ * More yuck to audit.
+ */
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
break;
@@ -2348,12 +2807,12 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+ cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
@@ -2375,6 +2834,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
+ static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
@@ -2390,6 +2850,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
stop->sched_class = &stop_sched_class;
+
+ /*
+ * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+ * adjust the effective priority of a task. As a result,
+ * rt_mutex_setprio() can trigger (RT) balancing operations,
+ * which can then trigger wakeups of the stop thread to push
+ * around the current task.
+ *
+ * The stop task itself will never be part of the PI-chain, it
+ * never blocks, therefore that ->pi_lock recursion is safe.
+ * Tell lockdep about this by placing the stop->pi_lock in its
+ * own class.
+ */
+ lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
@@ -2403,15 +2877,23 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else
+#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask, bool check)
+ const struct cpumask *new_mask,
+ u32 flags)
{
return set_cpus_allowed_ptr(p, new_mask);
}
-#endif /* CONFIG_SMP */
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return false;
+}
+
+#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -2465,7 +2947,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
- * Our task @p is fully woken up and running; so its safe to
+ * Our task @p is fully woken up and running; so it's safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
rq_unpin_lock(rq, rf);
@@ -2647,6 +3129,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(int cpu, int wake_flags)
{
/*
+ * Do not complicate things with the async wake_list while the CPU is
+ * in hotplug state.
+ */
+ if (!cpu_active(cpu))
+ return false;
+
+ /*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
@@ -2952,7 +3441,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) CPU is still in the middle of schedule() with
- * this task as prev, wait until its done referencing the task.
+ * this task as prev, wait until it's done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
*
@@ -2961,7 +3450,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
@@ -3103,6 +3592,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
}
@@ -3349,7 +3839,7 @@ void wake_up_new_task(struct task_struct *p)
*/
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
- __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -3361,7 +3851,7 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
- * Nothing relies on rq->lock after this, so its fine to
+ * Nothing relies on rq->lock after this, so it's fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
@@ -3490,6 +3980,76 @@ static inline void finish_task(struct task_struct *prev)
#endif
}
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ void (*func)(struct rq *rq);
+ struct callback_head *next;
+
+ lockdep_assert_held(&rq->lock);
+
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
+
+ func(rq);
+ }
+}
+
+static void balance_push(struct rq *rq);
+
+struct callback_head balance_push_callback = {
+ .next = NULL,
+ .func = (void (*)(struct callback_head *))balance_push,
+};
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ struct callback_head *head = rq->balance_callback;
+
+ lockdep_assert_held(&rq->lock);
+ if (head)
+ rq->balance_callback = NULL;
+
+ return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+ do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+ unsigned long flags;
+
+ if (unlikely(head)) {
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ do_balance_callbacks(rq, head);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+#endif
+
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -3515,6 +4075,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+ __balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3530,6 +4091,22 @@ static inline void finish_lock_switch(struct rq *rq)
# define finish_arch_post_lock_switch() do { } while (0)
#endif
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+ if (unlikely(current->kmap_ctrl.idx))
+ __kmap_local_sched_in();
+#endif
+}
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
@@ -3552,6 +4129,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
+ kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
}
@@ -3618,6 +4196,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
+ /*
+ * kmap_local_sched_out() is invoked with rq::lock held and
+ * interrupts disabled. There is no requirement for that, but the
+ * sched out code does not have an interrupt enabled section.
+ * Restoring the maps on sched in does not require interrupts being
+ * disabled either.
+ */
+ kmap_local_sched_in();
fire_sched_in_preempt_notifiers(current);
/*
@@ -3656,43 +4242,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
return rq;
}
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
@@ -3712,7 +4261,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
rq = finish_task_switch(prev);
- balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
@@ -3841,7 +4389,7 @@ unsigned long nr_iowait_cpu(int cpu)
}
/*
- * IO-wait accounting, and how its mostly bollocks (on SMP).
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
*
* The idea behind IO-wait account is to account the idle time that we could
* have spend running if it were not for IO. That is, if we were to improve the
@@ -3893,7 +4441,7 @@ void sched_exec(void)
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -4296,6 +4844,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
+ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4336,7 +4885,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
- * higher scheduling class, because otherwise those loose the
+ * higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
if (likely(prev->sched_class <= &fair_sched_class &&
@@ -4520,6 +5069,7 @@ static void __sched notrace __schedule(bool preempt)
*/
++*switch_count;
+ migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(preempt, prev, next);
@@ -4528,10 +5078,11 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unlock_irq(rq, &rf);
- }
- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ }
}
void __noreturn do_task_dead(void)
@@ -4635,7 +5186,7 @@ void __sched schedule_idle(void)
} while (need_resched());
}
-#ifdef CONFIG_CONTEXT_TRACKING
+#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -4857,7 +5408,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
* right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
* ensure a task is de-boosted (pi_task is set to NULL) before the
* task is allowed to run again (and can exit). This ensures the pointer
- * points to a blocked task -- which guaratees the task is present.
+ * points to a blocked task -- which guarantees the task is present.
*/
p->pi_top_task = pi_task;
@@ -4943,9 +5494,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
- __task_rq_unlock(rq, &rf);
- balance_callback(rq);
+ rq_unpin_lock(rq, &rf);
+ __balance_callbacks(rq);
+ raw_spin_unlock(&rq->lock);
+
preempt_enable();
}
#else
@@ -4974,7 +5527,7 @@ void set_user_nice(struct task_struct *p, long nice)
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
- * it wont have any effect on scheduling until the task is
+ * it won't have any effect on scheduling until the task is
* SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
*/
if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
@@ -5219,6 +5772,7 @@ static int __sched_setscheduler(struct task_struct *p,
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
+ struct callback_head *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5457,6 +6011,7 @@ change:
/* Avoid rq from going away on us: */
preempt_disable();
+ head = splice_balance_callbacks(rq);
task_rq_unlock(rq, p, &rf);
if (pi) {
@@ -5465,7 +6020,7 @@ change:
}
/* Run balance callbacks after we've adjusted the PI chain: */
- balance_callback(rq);
+ balance_callbacks(rq, head);
preempt_enable();
return 0;
@@ -5960,7 +6515,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = __set_cpus_allowed_ptr(p, new_mask, true);
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -6082,14 +6637,6 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
return ret;
}
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- *
- * Return: 0.
- */
static void do_sched_yield(void)
{
struct rq_flags rf;
@@ -6100,17 +6647,21 @@ static void do_sched_yield(void)
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
preempt_disable();
- rq_unlock(rq, &rf);
+ rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
}
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
SYSCALL_DEFINE0(sched_yield)
{
do_sched_yield();
@@ -6165,7 +6716,7 @@ EXPORT_SYMBOL(__cond_resched_lock);
*
* The scheduler is at all times free to pick the calling task as the most
* eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
+ * it, it's already broken.
*
* Typical broken usage is:
*
@@ -6453,6 +7004,7 @@ void sched_show_task(struct task_struct *p)
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
+ print_stop_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
@@ -6538,12 +7090,12 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
/*
- * Its possible that init_idle() gets called multiple times on a task,
+ * It's possible that init_idle() gets called multiple times on a task,
* in that case do_set_cpus_allowed() will not do the right thing.
*
* And since this is boot we can forgo the serialization.
*/
- set_cpus_allowed_common(idle, cpumask_of(cpu));
+ set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
#endif
/*
* We're having a chicken and egg problem, even though we are
@@ -6694,119 +7246,139 @@ void idle_task_exit(void)
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
+static int __balance_push_cpu_stop(void *arg)
{
- long delta = calc_load_fold_active(rq, 1);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks);
-}
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
- const struct sched_class *class;
- struct task_struct *next;
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
- for_each_class(class) {
- next = class->pick_next_task(rq);
- if (next) {
- next->sched_class->put_prev_task(rq, next);
- return next;
- }
+ update_rq_clock(rq);
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
}
- /* The idle class should always have a runnable task */
- BUG();
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
*/
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+static void balance_push(struct rq *rq)
{
- struct rq *rq = dead_rq;
- struct task_struct *next, *stop = rq->stop;
- struct rq_flags orf = *rf;
- int dest_cpu;
+ struct task_struct *push_task = rq->curr;
+ lockdep_assert_held(&rq->lock);
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
- * Fudge the rq selection such that the below task selection loop
- * doesn't get stuck on the currently eligible stop task.
- *
- * We're currently inside stop_machine() and the rq is either stuck
- * in the stop_machine_cpu_stop() loop, or we're executing this code,
- * either way we should never end up calling schedule() until we're
- * done here.
+ * Ensure the thing is persistent until balance_push_set(.on = false);
*/
- rq->stop = NULL;
+ rq->balance_callback = &balance_push_callback;
/*
- * put_prev_task() and pick_next_task() sched
- * class method both need to have an up-to-date
- * value of rq->clock[_task]
+ * Both the cpu-hotplug and stop task are in this case and are
+ * required to complete the hotplug process.
+ *
+ * XXX: the idle task does not match kthread_is_per_cpu() due to
+ * histerical raisins.
*/
- update_rq_clock(rq);
-
- for (;;) {
- /*
- * There's this thread running, bail when that's the only
- * remaining thread:
- */
- if (rq->nr_running == 1)
- break;
-
- next = __pick_migrate_task(rq);
+ if (rq->idle == push_task ||
+ ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+ is_migration_disabled(push_task)) {
/*
- * Rules for changing task_struct::cpus_mask are holding
- * both pi_lock and rq->lock, such that holding either
- * stabilizes the mask.
+ * If this is the idle task on the outgoing CPU try to wake
+ * up the hotplug control thread which might wait for the
+ * last task to vanish. The rcuwait_active() check is
+ * accurate here because the waiter is pinned on this CPU
+ * and can't obviously be running in parallel.
*
- * Drop rq->lock is not quite as disastrous as it usually is
- * because !cpu_active at this point, which means load-balance
- * will not interfere. Also, stop-machine.
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- rq_unlock(rq, rf);
- raw_spin_lock(&next->pi_lock);
- rq_relock(rq, rf);
-
- /*
- * Since we're inside stop-machine, _nothing_ should have
- * changed the task, WARN if weird stuff happened, because in
- * that case the above rq->lock drop is a fail too.
- */
- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
- raw_spin_unlock(&next->pi_lock);
- continue;
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
+ raw_spin_unlock(&rq->lock);
+ rcuwait_wake_up(&rq->hotplug_wait);
+ raw_spin_lock(&rq->lock);
}
+ return;
+ }
- /* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- rq = __migrate_task(rq, rf, next, dest_cpu);
- if (rq != dead_rq) {
- rq_unlock(rq, rf);
- rq = dead_rq;
- *rf = orf;
- rq_relock(rq, rf);
- }
- raw_spin_unlock(&next->pi_lock);
+ get_task_struct(push_task);
+ /*
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
+ */
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_work));
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which kthread_is_per_cpu() and will push this task away.
+ */
+ raw_spin_lock(&rq->lock);
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ rq->balance_push = on;
+ if (on) {
+ WARN_ON_ONCE(rq->balance_callback);
+ rq->balance_callback = &balance_push_callback;
+ } else if (rq->balance_callback == &balance_push_callback) {
+ rq->balance_callback = NULL;
}
+ rq_unlock_irqrestore(rq, &rf);
+}
- rq->stop = stop;
+/*
+ * Invoked from a CPUs hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+ struct rq *rq = this_rq();
+
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
+ TASK_UNINTERRUPTIBLE);
}
+
+#else
+
+static inline void balance_push(struct rq *rq)
+{
+}
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
+static inline void balance_hotplug_wait(void)
+{
+}
+
#endif /* CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
@@ -6892,6 +7464,12 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+ /*
+ * Make sure that when the hotplug state machine does a roll-back
+ * we clear balance_push. Ideally that would happen earlier...
+ */
+ balance_push_set(cpu, false);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
@@ -6927,18 +7505,40 @@ int sched_cpu_activate(unsigned int cpu)
int sched_cpu_deactivate(unsigned int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
int ret;
set_cpu_active(cpu, false);
+
+ /*
+ * From this point forward, this CPU will refuse to run any task that
+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+ * push those tasks away until this gets cleared, see
+ * sched_cpu_dying().
+ */
+ balance_push_set(cpu, true);
+
/*
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
- * users of this state to go away such that all new such users will
- * observe it.
+ * We've cleared cpu_active_mask / set balance_push, wait for all
+ * preempt-disabled and RCU users of this state to go away such that
+ * all new such users will observe it.
+ *
+ * Specifically, we rely on ttwu to no longer target this CPU, see
+ * ttwu_queue_cond() and is_cpu_allowed().
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
synchronize_rcu();
+ rq_lock_irqsave(rq, &rf);
+ if (rq->rd) {
+ update_rq_clock(rq);
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
@@ -6952,6 +7552,7 @@ int sched_cpu_deactivate(unsigned int cpu)
ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ balance_push_set(cpu, false);
set_cpu_active(cpu, true);
return ret;
}
@@ -6975,6 +7576,60 @@ int sched_cpu_starting(unsigned int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+ balance_hotplug_wait();
+ return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+ long delta = calc_load_fold_active(rq, 1);
+
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+}
+
+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+{
+ struct task_struct *g, *p;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_held(&rq->lock);
+
+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != cpu)
+ continue;
+
+ if (!task_on_rq_queued(p))
+ continue;
+
+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+ }
+}
+
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6984,14 +7639,18 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+ WARN(true, "Dying CPU not properly vacated!");
+ dump_rq_tasks(rq, KERN_WARNING);
}
- migrate_tasks(rq, &rf);
- BUG_ON(rq->nr_running != 1);
rq_unlock_irqrestore(rq, &rf);
+ /*
+ * Now that the CPU is offline, make sure we're welcome
+ * to new tasks once we come back up.
+ */
+ balance_push_set(cpu, false);
+
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(rq);
@@ -7194,7 +7853,10 @@ void __init sched_init(void)
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
- rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
+ INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ rcuwait_init(&rq->hotplug_wait);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
@@ -7333,6 +7995,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_sleep);
+
+#ifdef CONFIG_SMP
+void __cant_migrate(const char *file, int line)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (is_migration_disabled(current))
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > 0)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), is_migration_disabled(current),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_migrate);
+#endif
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -7666,7 +8361,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
return -EINVAL;
#endif
/*
- * Serialize against wake_up_new_task() such that if its
+ * Serialize against wake_up_new_task() such that if it's
* running, we're sure to observe its full state.
*/
raw_spin_lock_irq(&task->pi_lock);