diff options
| -rw-r--r-- | include/linux/cpuset.h | 8 | ||||
| -rw-r--r-- | include/linux/mmu_context.h | 14 | ||||
| -rw-r--r-- | include/linux/sched.h | 21 | ||||
| -rw-r--r-- | include/linux/sched/sysctl.h | 18 | ||||
| -rw-r--r-- | include/linux/wait.h | 2 | ||||
| -rw-r--r-- | init/init_task.c | 1 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 59 | ||||
| -rw-r--r-- | kernel/fork.c | 2 | ||||
| -rw-r--r-- | kernel/sched/core.c | 440 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 8 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 10 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 211 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 26 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 65 | 
14 files changed, 714 insertions, 171 deletions
| diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 04c20de66afc..d2b9c41c8edf 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -15,6 +15,7 @@  #include <linux/cpumask.h>  #include <linux/nodemask.h>  #include <linux/mm.h> +#include <linux/mmu_context.h>  #include <linux/jump_label.h>  #ifdef CONFIG_CPUSETS @@ -58,7 +59,7 @@ extern void cpuset_wait_for_hotplug(void);  extern void cpuset_read_lock(void);  extern void cpuset_read_unlock(void);  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern void cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);  #define cpuset_current_mems_allowed (current->mems_allowed)  void cpuset_init_current_mems_allowed(void); @@ -184,11 +185,12 @@ static inline void cpuset_read_unlock(void) { }  static inline void cpuset_cpus_allowed(struct task_struct *p,  				       struct cpumask *mask)  { -	cpumask_copy(mask, cpu_possible_mask); +	cpumask_copy(mask, task_cpu_possible_mask(p));  } -static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) +static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)  { +	return false;  }  static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 03dee12d2b61..b9b970f7ab45 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -14,4 +14,18 @@  static inline void leave_mm(int cpu) { }  #endif +/* + * CPUs that are capable of running user task @p. Must contain at least one + * active CPU. It is assumed that the kernel can run on all CPUs, so calling + * this for a kernel thread is pointless. + * + * By default, we assume a sane, homogeneous system. 
+ */ +#ifndef task_cpu_possible_mask +# define task_cpu_possible_mask(p)	cpu_possible_mask +# define task_cpu_possible(cpu, p)	true +#else +# define task_cpu_possible(cpu, p)	cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) +#endif +  #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641..6ecd02e2ca1e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -748,6 +748,7 @@ struct task_struct {  	unsigned int			policy;  	int				nr_cpus_allowed;  	const cpumask_t			*cpus_ptr; +	cpumask_t			*user_cpus_ptr;  	cpumask_t			cpus_mask;  	void				*migration_pending;  #ifdef CONFIG_SMP @@ -1705,6 +1706,11 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_  #ifdef CONFIG_SMP  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); +extern void release_user_cpus_ptr(struct task_struct *p); +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);  #else  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  { @@ -1715,6 +1721,21 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma  		return -EINVAL;  	return 0;  } +static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) +{ +	if (src->user_cpus_ptr) +		return -EINVAL; +	return 0; +} +static inline void release_user_cpus_ptr(struct task_struct *p) +{ +	WARN_ON(p->user_cpus_ptr); +} + +static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ +	return 0; +}  #endif  extern int yield_to(struct task_struct *p, bool preempt); diff --git 
a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index db2c0f34aaaf..304f431178fd 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -28,30 +28,12 @@ enum { sysctl_hung_task_timeout_secs = 0 };  extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -  enum sched_tunable_scaling {  	SCHED_TUNABLESCALING_NONE,  	SCHED_TUNABLESCALING_LOG,  	SCHED_TUNABLESCALING_LINEAR,  	SCHED_TUNABLESCALING_END,  }; -extern unsigned int sysctl_sched_tunable_scaling; - -extern unsigned int sysctl_numa_balancing_scan_delay; -extern unsigned int sysctl_numa_balancing_scan_period_min; -extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_size; - -#ifdef CONFIG_SCHED_DEBUG -extern __read_mostly unsigned int sysctl_sched_migration_cost; -extern __read_mostly unsigned int sysctl_sched_nr_migrate; - -extern int sysctl_resched_latency_warn_ms; -extern int sysctl_resched_latency_warn_once; -#endif  /*   *  control realtime throttling: diff --git a/include/linux/wait.h b/include/linux/wait.h index 6598ae35e1b5..93dab0e9580f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -56,7 +56,7 @@ struct task_struct;  #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {					\  	.lock		= __SPIN_LOCK_UNLOCKED(name.lock),			\ -	.head		= { &(name).head, &(name).head } } +	.head		= LIST_HEAD_INIT(name.head) }  #define DECLARE_WAIT_QUEUE_HEAD(name) \  	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) diff --git a/init/init_task.c b/init/init_task.c index 562f2ef8d157..2d024066e27b 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -80,6 +80,7 @@ struct task_struct init_task  	.normal_prio	= MAX_PRIO - 20,  	.policy		= SCHED_NORMAL,  	.cpus_ptr	= &init_task.cpus_mask, +	.user_cpus_ptr	= NULL,  	.cpus_mask	= CPU_MASK_ALL,  	.nr_cpus_allowed= 
NR_CPUS,  	.mm		= NULL, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index adb5190c4429..6500cbe0ce16 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void)  }  /* - * Return in pmask the portion of a cpusets's cpus_allowed that - * are online.  If none are online, walk up the cpuset hierarchy - * until we find one that does have some online cpus. + * Return in pmask the portion of a task's cpuset's cpus_allowed that + * are online and are capable of running the task.  If none are found, + * walk up the cpuset hierarchy until we find one that does have some + * appropriate cpus.   *   * One way or another, we guarantee to return some non-empty subset   * of cpu_online_mask.   *   * Call with callback_lock or cpuset_mutex held.   */ -static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) +static void guarantee_online_cpus(struct task_struct *tsk, +				  struct cpumask *pmask)  { -	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) { +	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); +	struct cpuset *cs; + +	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) +		cpumask_copy(pmask, cpu_online_mask); + +	rcu_read_lock(); +	cs = task_cs(tsk); + +	while (!cpumask_intersects(cs->effective_cpus, pmask)) {  		cs = parent_cs(cs);  		if (unlikely(!cs)) {  			/* @@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)  			 * cpuset's effective_cpus is on its way to be  			 * identical to cpu_online_mask.  			 
*/ -			cpumask_copy(pmask, cpu_online_mask); -			return; +			goto out_unlock;  		}  	} -	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); +	cpumask_and(pmask, pmask, cs->effective_cpus); + +out_unlock: +	rcu_read_unlock();  }  /* @@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)  	percpu_down_write(&cpuset_rwsem); -	/* prepare for attach */ -	if (cs == &top_cpuset) -		cpumask_copy(cpus_attach, cpu_possible_mask); -	else -		guarantee_online_cpus(cs, cpus_attach); -  	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);  	cgroup_taskset_for_each(task, css, tset) { +		if (cs != &top_cpuset) +			guarantee_online_cpus(task, cpus_attach); +		else +			cpumask_copy(cpus_attach, task_cpu_possible_mask(task));  		/*  		 * can_attach beforehand should guarantee that this doesn't  		 * fail.  TODO: have a better way to handle failure here @@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)  	unsigned long flags;  	spin_lock_irqsave(&callback_lock, flags); -	rcu_read_lock(); -	guarantee_online_cpus(task_cs(tsk), pmask); -	rcu_read_unlock(); +	guarantee_online_cpus(tsk, pmask);  	spin_unlock_irqrestore(&callback_lock, flags);  } @@ -3318,13 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)   * which will not contain a sane cpumask during cases such as cpu hotplugging.   * This is the absolute last resort for the scheduler and it is only used if   * _every_ other avenue has been traveled. + * + * Returns true if the affinity of @tsk was changed, false otherwise.   **/ -void cpuset_cpus_allowed_fallback(struct task_struct *tsk) +bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)  { +	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); +	const struct cpumask *cs_mask; +	bool changed = false; +  	rcu_read_lock(); -	do_set_cpus_allowed(tsk, is_in_v2_mode() ? 
-		task_cs(tsk)->cpus_allowed : cpu_possible_mask); +	cs_mask = task_cs(tsk)->cpus_allowed; +	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { +		do_set_cpus_allowed(tsk, cs_mask); +		changed = true; +	}  	rcu_read_unlock();  	/* @@ -3344,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)  	 * select_fallback_rq() will fix things ups and set cpu_possible_mask  	 * if required.  	 */ +	return changed;  }  void __init cpuset_init_current_mems_allowed(void) diff --git a/kernel/fork.c b/kernel/fork.c index bc94b2cc5995..bd0e165b8397 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk)  void free_task(struct task_struct *tsk)  { +	release_user_cpus_ptr(tsk);  	scs_release(tsk);  #ifndef CONFIG_THREAD_INFO_IN_TASK @@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  #endif  	if (orig->cpus_ptr == &orig->cpus_mask)  		tsk->cpus_ptr = &tsk->cpus_mask; +	dup_user_cpus_ptr(tsk, orig, node);  	/*  	 * One for the user space visible state that goes away when reaped. 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20ffcc044134..8dc67166aa6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -993,6 +993,7 @@ int get_nohz_timer_target(void)  {  	int i, cpu = smp_processor_id(), default_cpu = -1;  	struct sched_domain *sd; +	const struct cpumask *hk_mask;  	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {  		if (!idle_cpu(cpu)) @@ -1000,10 +1001,11 @@ int get_nohz_timer_target(void)  		default_cpu = cpu;  	} +	hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); +  	rcu_read_lock();  	for_each_domain(cpu, sd) { -		for_each_cpu_and(i, sched_domain_span(sd), -			housekeeping_cpumask(HK_FLAG_TIMER)) { +		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {  			if (cpu == i)  				continue; @@ -1619,6 +1621,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)  		uclamp_rq_dec_id(rq, p, clamp_id);  } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, +				      enum uclamp_id clamp_id) +{ +	if (!p->uclamp[clamp_id].active) +		return; + +	uclamp_rq_dec_id(rq, p, clamp_id); +	uclamp_rq_inc_id(rq, p, clamp_id); + +	/* +	 * Make sure to clear the idle flag if we've transiently reached 0 +	 * active tasks on rq. +	 */ +	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) +		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} +  static inline void  uclamp_update_active(struct task_struct *p)  { @@ -1642,12 +1661,8 @@ uclamp_update_active(struct task_struct *p)  	 * affecting a valid clamp bucket, the next time it's enqueued,  	 * it will already see the updated clamp bucket value.  	 
*/ -	for_each_clamp_id(clamp_id) { -		if (p->uclamp[clamp_id].active) { -			uclamp_rq_dec_id(rq, p, clamp_id); -			uclamp_rq_inc_id(rq, p, clamp_id); -		} -	} +	for_each_clamp_id(clamp_id) +		uclamp_rq_reinc_id(rq, p, clamp_id);  	task_rq_unlock(rq, p, &rf);  } @@ -2161,7 +2176,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)  	/* Non kernel threads are not allowed during either online or offline. */  	if (!(p->flags & PF_KTHREAD)) -		return cpu_active(cpu); +		return cpu_active(cpu) && task_cpu_possible(cpu, p);  	/* KTHREAD_IS_PER_CPU is always allowed. */  	if (kthread_is_per_cpu(p)) @@ -2468,6 +2483,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  	__do_set_cpus_allowed(p, new_mask, 0);  } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, +		      int node) +{ +	if (!src->user_cpus_ptr) +		return 0; + +	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); +	if (!dst->user_cpus_ptr) +		return -ENOMEM; + +	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); +	return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ +	struct cpumask *user_mask = NULL; + +	swap(p->user_cpus_ptr, user_mask); + +	return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ +	kfree(clear_user_cpus_ptr(p)); +} +  /*   * This function is wildly self concurrent; here be dragons.   * @@ -2685,28 +2728,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag  }  /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning.   
*/ -static int __set_cpus_allowed_ptr(struct task_struct *p, -				  const struct cpumask *new_mask, -				  u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, +					 const struct cpumask *new_mask, +					 u32 flags, +					 struct rq *rq, +					 struct rq_flags *rf) +	__releases(rq->lock) +	__releases(p->pi_lock)  { +	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);  	const struct cpumask *cpu_valid_mask = cpu_active_mask; +	bool kthread = p->flags & PF_KTHREAD; +	struct cpumask *user_mask = NULL;  	unsigned int dest_cpu; -	struct rq_flags rf; -	struct rq *rq;  	int ret = 0; -	rq = task_rq_lock(p, &rf);  	update_rq_clock(rq); -	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { +	if (kthread || is_migration_disabled(p)) {  		/*  		 * Kernel threads are allowed on online && !active CPUs,  		 * however, during cpu-hot-unplug, even these might get pushed @@ -2720,6 +2761,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  		cpu_valid_mask = cpu_online_mask;  	} +	if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { +		ret = -EINVAL; +		goto out; +	} +  	/*  	 * Must re-check here, to close a race against __kthread_bind(),  	 * sched_setaffinity() is not guaranteed to observe the flag. @@ -2754,20 +2800,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  	__do_set_cpus_allowed(p, new_mask, flags); -	return affine_move_task(rq, p, &rf, dest_cpu, flags); +	if (flags & SCA_USER) +		user_mask = clear_user_cpus_ptr(p); + +	ret = affine_move_task(rq, p, rf, dest_cpu, flags); + +	kfree(user_mask); + +	return ret;  out: -	task_rq_unlock(rq, p, &rf); +	task_rq_unlock(rq, p, rf);  	return ret;  } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. 
The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, +				  const struct cpumask *new_mask, u32 flags) +{ +	struct rq_flags rf; +	struct rq *rq; + +	rq = task_rq_lock(p, &rf); +	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} +  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)  {  	return __set_cpus_allowed_ptr(p, new_mask, 0);  }  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, +				     struct cpumask *new_mask, +				     const struct cpumask *subset_mask) +{ +	struct cpumask *user_mask = NULL; +	struct rq_flags rf; +	struct rq *rq; +	int err; + +	if (!p->user_cpus_ptr) { +		user_mask = kmalloc(cpumask_size(), GFP_KERNEL); +		if (!user_mask) +			return -ENOMEM; +	} + +	rq = task_rq_lock(p, &rf); + +	/* +	 * Forcefully restricting the affinity of a deadline task is +	 * likely to cause problems, so fail and noisily override the +	 * mask entirely. +	 */ +	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { +		err = -EPERM; +		goto err_unlock; +	} + +	if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { +		err = -EINVAL; +		goto err_unlock; +	} + +	/* +	 * We're about to butcher the task affinity, so keep track of what +	 * the user asked for in case we're able to restore it later on. 
+	 */ +	if (user_mask) { +		cpumask_copy(user_mask, p->cpus_ptr); +		p->user_cpus_ptr = user_mask; +	} + +	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: +	task_rq_unlock(rq, p, &rf); +	kfree(user_mask); +	return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ +	cpumask_var_t new_mask; +	const struct cpumask *override_mask = task_cpu_possible_mask(p); + +	alloc_cpumask_var(&new_mask, GFP_KERNEL); + +	/* +	 * __migrate_task() can fail silently in the face of concurrent +	 * offlining of the chosen destination CPU, so take the hotplug +	 * lock to ensure that the migration succeeds. +	 */ +	cpus_read_lock(); +	if (!cpumask_available(new_mask)) +		goto out_set_mask; + +	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) +		goto out_free_mask; + +	/* +	 * We failed to find a valid subset of the affinity mask for the +	 * task, so override it based on its cpuset hierarchy. +	 */ +	cpuset_cpus_allowed(p, new_mask); +	override_mask = new_mask; + +out_set_mask: +	if (printk_ratelimit()) { +		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", +				task_pid_nr(p), p->comm, +				cpumask_pr_args(override_mask)); +	} + +	WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: +	cpus_read_unlock(); +	free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. 
+ * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ +	struct cpumask *user_mask = p->user_cpus_ptr; +	unsigned long flags; + +	/* +	 * Try to restore the old affinity mask. If this fails, then +	 * we free the mask explicitly to avoid it being inherited across +	 * a subsequent fork(). +	 */ +	if (!user_mask || !__sched_setaffinity(p, user_mask)) +		return; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	user_mask = clear_user_cpus_ptr(p); +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +	kfree(user_mask); +} +  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  {  #ifdef CONFIG_SCHED_DEBUG @@ -3112,9 +3316,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  		/* Look for allowed, online CPU in same node. */  		for_each_cpu(dest_cpu, nodemask) { -			if (!cpu_active(dest_cpu)) -				continue; -			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) +			if (is_cpu_allowed(p, dest_cpu))  				return dest_cpu;  		}  	} @@ -3131,8 +3333,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  		/* No more Mr. Nice Guy. */  		switch (state) {  		case cpuset: -			if (IS_ENABLED(CONFIG_CPUSETS)) { -				cpuset_cpus_allowed_fallback(p); +			if (cpuset_cpus_allowed_fallback(p)) {  				state = possible;  				break;  			} @@ -3144,10 +3345,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  			 *  			 * More yuck to audit.  			 
*/ -			do_set_cpus_allowed(p, cpu_possible_mask); +			do_set_cpus_allowed(p, task_cpu_possible_mask(p));  			state = fail;  			break; -  		case fail:  			BUG();  			break; @@ -5660,11 +5860,9 @@ static bool try_steal_cookie(int this, int that)  		if (p->core_occupation > dst->idle->core_occupation)  			goto next; -		p->on_rq = TASK_ON_RQ_MIGRATING;  		deactivate_task(src, p, 0);  		set_task_cpu(p, this);  		activate_task(dst, p, 0); -		p->on_rq = TASK_ON_RQ_QUEUED;  		resched_curr(dst); @@ -7300,6 +7498,16 @@ err_size:  	return -E2BIG;  } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ +	if (task_has_dl_policy(p)) +		__getparam_dl(p, attr); +	else if (task_has_rt_policy(p)) +		attr->sched_priority = p->rt_priority; +	else +		attr->sched_nice = task_nice(p); +} +  /**   * sys_sched_setscheduler - set/change the scheduler policy and RT priority   * @pid: the pid in question. @@ -7361,6 +7569,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,  	rcu_read_unlock();  	if (likely(p)) { +		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) +			get_params(p, &attr);  		retval = sched_setattr(p, &attr);  		put_task_struct(p);  	} @@ -7509,12 +7719,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,  	kattr.sched_policy = p->policy;  	if (p->sched_reset_on_fork)  		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -	if (task_has_dl_policy(p)) -		__getparam_dl(p, &kattr); -	else if (task_has_rt_policy(p)) -		kattr.sched_priority = p->rt_priority; -	else -		kattr.sched_nice = task_nice(p); +	get_params(p, &kattr); +	kattr.sched_flags &= SCHED_FLAG_ALL;  #ifdef CONFIG_UCLAMP_TASK  	/* @@ -7535,9 +7741,76 @@ out_unlock:  	return retval;  } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)  { +	int ret = 0; + +	/* +	 * If the task isn't a deadline task or admission control is +	 * 
disabled then we don't care about affinity changes. +	 */ +	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) +		return 0; + +	/* +	 * Since bandwidth control happens on root_domain basis, +	 * if admission test is enabled, we only admit -deadline +	 * tasks allowed to run on all the CPUs in the task's +	 * root_domain. +	 */ +	rcu_read_lock(); +	if (!cpumask_subset(task_rq(p)->rd->span, mask)) +		ret = -EBUSY; +	rcu_read_unlock(); +	return ret; +} +#endif + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) +{ +	int retval;  	cpumask_var_t cpus_allowed, new_mask; + +	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) +		return -ENOMEM; + +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { +		retval = -ENOMEM; +		goto out_free_cpus_allowed; +	} + +	cpuset_cpus_allowed(p, cpus_allowed); +	cpumask_and(new_mask, mask, cpus_allowed); + +	retval = dl_task_check_affinity(p, new_mask); +	if (retval) +		goto out_free_new_mask; +again: +	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); +	if (retval) +		goto out_free_new_mask; + +	cpuset_cpus_allowed(p, cpus_allowed); +	if (!cpumask_subset(new_mask, cpus_allowed)) { +		/* +		 * We must have raced with a concurrent cpuset update. +		 * Just reset the cpumask to the cpuset's cpus_allowed. 
+		 */ +		cpumask_copy(new_mask, cpus_allowed); +		goto again; +	} + +out_free_new_mask: +	free_cpumask_var(new_mask); +out_free_cpus_allowed: +	free_cpumask_var(cpus_allowed); +	return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{  	struct task_struct *p;  	int retval; @@ -7557,68 +7830,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  		retval = -EINVAL;  		goto out_put_task;  	} -	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -		retval = -ENOMEM; -		goto out_put_task; -	} -	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -		retval = -ENOMEM; -		goto out_free_cpus_allowed; -	} -	retval = -EPERM; +  	if (!check_same_owner(p)) {  		rcu_read_lock();  		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {  			rcu_read_unlock(); -			goto out_free_new_mask; +			retval = -EPERM; +			goto out_put_task;  		}  		rcu_read_unlock();  	}  	retval = security_task_setscheduler(p);  	if (retval) -		goto out_free_new_mask; - - -	cpuset_cpus_allowed(p, cpus_allowed); -	cpumask_and(new_mask, in_mask, cpus_allowed); - -	/* -	 * Since bandwidth control happens on root_domain basis, -	 * if admission test is enabled, we only admit -deadline -	 * tasks allowed to run on all the CPUs in the task's -	 * root_domain. -	 */ -#ifdef CONFIG_SMP -	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { -		rcu_read_lock(); -		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { -			retval = -EBUSY; -			rcu_read_unlock(); -			goto out_free_new_mask; -		} -		rcu_read_unlock(); -	} -#endif -again: -	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); +		goto out_put_task; -	if (!retval) { -		cpuset_cpus_allowed(p, cpus_allowed); -		if (!cpumask_subset(new_mask, cpus_allowed)) { -			/* -			 * We must have raced with a concurrent cpuset -			 * update. 
Just reset the cpus_allowed to the -			 * cpuset's cpus_allowed -			 */ -			cpumask_copy(new_mask, cpus_allowed); -			goto again; -		} -	} -out_free_new_mask: -	free_cpumask_var(new_mask); -out_free_cpus_allowed: -	free_cpumask_var(cpus_allowed); +	retval = __sched_setaffinity(p, in_mask);  out_put_task:  	put_task_struct(p);  	return retval; @@ -9804,7 +10031,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,  	 * Prevent race between setting of cfs_rq->runtime_enabled and  	 * unthrottle_offline_cfs_rqs().  	 */ -	get_online_cpus(); +	cpus_read_lock();  	mutex_lock(&cfs_constraints_mutex);  	ret = __cfs_schedulable(tg, period, quota);  	if (ret) @@ -9848,7 +10075,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,  		cfs_bandwidth_usage_dec();  out_unlock:  	mutex_unlock(&cfs_constraints_mutex); -	put_online_cpus(); +	cpus_read_unlock();  	return ret;  } @@ -10099,6 +10326,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,  }  #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, +			       struct cftype *cft) +{ +	return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +				struct cftype *cft, s64 idle) +{ +	return sched_group_set_idle(css_tg(css), idle); +} +#endif +  static struct cftype cpu_legacy_files[] = {  #ifdef CONFIG_FAIR_GROUP_SCHED  	{ @@ -10106,6 +10347,11 @@ static struct cftype cpu_legacy_files[] = {  		.read_u64 = cpu_shares_read_u64,  		.write_u64 = cpu_shares_write_u64,  	}, +	{ +		.name = "idle", +		.read_s64 = cpu_idle_read_s64, +		.write_s64 = cpu_idle_write_s64, +	},  #endif  #ifdef CONFIG_CFS_BANDWIDTH  	{ @@ -10313,6 +10559,12 @@ static struct cftype cpu_files[] = {  		.read_s64 = cpu_weight_nice_read_s64,  		.write_s64 = cpu_weight_nice_write_s64,  	}, +	{ +		.name = "idle", +		.flags = CFTYPE_NOT_ON_ROOT, +		.read_s64 = cpu_idle_read_s64, 
+		.write_s64 = cpu_idle_write_s64, +	},  #endif  #ifdef CONFIG_CFS_BANDWIDTH  	{ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaacd6cfd42f..e94314633b39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused  	 */  	raw_spin_rq_lock(rq);  	if (p->dl.dl_non_contending) { +		update_rq_clock(rq);  		sub_running_bw(&p->dl, &rq->dl);  		p->dl.dl_non_contending = 0;  		/* @@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)  	dl_se->dl_runtime = attr->sched_runtime;  	dl_se->dl_deadline = attr->sched_deadline;  	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; -	dl_se->flags = attr->sched_flags; +	dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;  	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);  	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);  } @@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)  	attr->sched_runtime = dl_se->dl_runtime;  	attr->sched_deadline = dl_se->dl_deadline;  	attr->sched_period = dl_se->dl_period; -	attr->sched_flags = dl_se->flags; +	attr->sched_flags &= ~SCHED_DL_FLAGS; +	attr->sched_flags |= dl_se->flags;  }  /* @@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)  	if (dl_se->dl_runtime != attr->sched_runtime ||  	    dl_se->dl_deadline != attr->sched_deadline ||  	    dl_se->dl_period != attr->sched_period || -	    dl_se->flags != attr->sched_flags) +	    dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))  		return true;  	return false; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0c5ec2776ddf..49716228efb4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void)  {  	int cpu, i; +	/* +	 * This can unfortunately be invoked before sched_debug_init() 
creates +	 * the debug directory. Don't touch sd_sysctl_cpus until then. +	 */ +	if (!debugfs_sched) +		return; +  	if (!cpumask_available(sd_sysctl_cpus)) {  		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))  			return; @@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",  			cfs_rq->nr_spread_over);  	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); +	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running", +			cfs_rq->idle_h_nr_running);  	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);  #ifdef CONFIG_SMP  	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44c452072a1b..5aa3cfd15a2e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)  	}  } +static int tg_is_idle(struct task_group *tg) +{ +	return tg->idle > 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ +	return cfs_rq->idle > 0; +} + +static int se_is_idle(struct sched_entity *se) +{ +	if (entity_is_task(se)) +		return task_has_idle_policy(task_of(se)); +	return cfs_rq_is_idle(group_cfs_rq(se)); +} +  #else	/* !CONFIG_FAIR_GROUP_SCHED */  #define for_each_sched_entity(se) \ @@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)  {  } +static int tg_is_idle(struct task_group *tg) +{ +	return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ +	return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ +	return 0; +} +  #endif	/* CONFIG_FAIR_GROUP_SCHED */  static __always_inline @@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu)  		if (cpu == sibling)  			continue; -		if (!idle_cpu(cpu)) +		if (!idle_cpu(sibling))  			return false;  	}  #endif @@ -4841,6 +4873,9 @@ static bool 
throttle_cfs_rq(struct cfs_rq *cfs_rq)  		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); +		if (cfs_rq_is_idle(group_cfs_rq(se))) +			idle_task_delta = cfs_rq->h_nr_running; +  		qcfs_rq->h_nr_running -= task_delta;  		qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)  		update_load_avg(qcfs_rq, se, 0);  		se_update_runnable(se); +		if (cfs_rq_is_idle(group_cfs_rq(se))) +			idle_task_delta = cfs_rq->h_nr_running; +  		qcfs_rq->h_nr_running -= task_delta;  		qcfs_rq->idle_h_nr_running -= idle_task_delta;  	} @@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	task_delta = cfs_rq->h_nr_running;  	idle_task_delta = cfs_rq->idle_h_nr_running;  	for_each_sched_entity(se) { +		struct cfs_rq *qcfs_rq = cfs_rq_of(se); +  		if (se->on_rq)  			break; -		cfs_rq = cfs_rq_of(se); -		enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); +		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + +		if (cfs_rq_is_idle(group_cfs_rq(se))) +			idle_task_delta = cfs_rq->h_nr_running; -		cfs_rq->h_nr_running += task_delta; -		cfs_rq->idle_h_nr_running += idle_task_delta; +		qcfs_rq->h_nr_running += task_delta; +		qcfs_rq->idle_h_nr_running += idle_task_delta;  		/* end evaluation on encountering a throttled cfs_rq */ -		if (cfs_rq_throttled(cfs_rq)) +		if (cfs_rq_throttled(qcfs_rq))  			goto unthrottle_throttle;  	}  	for_each_sched_entity(se) { -		cfs_rq = cfs_rq_of(se); +		struct cfs_rq *qcfs_rq = cfs_rq_of(se); -		update_load_avg(cfs_rq, se, UPDATE_TG); +		update_load_avg(qcfs_rq, se, UPDATE_TG);  		se_update_runnable(se); -		cfs_rq->h_nr_running += task_delta; -		cfs_rq->idle_h_nr_running += idle_task_delta; +		if (cfs_rq_is_idle(group_cfs_rq(se))) +			idle_task_delta = cfs_rq->h_nr_running; +		qcfs_rq->h_nr_running += task_delta; +		qcfs_rq->idle_h_nr_running += idle_task_delta;  		/* end evaluation on encountering a throttled cfs_rq */ -		if (cfs_rq_throttled(cfs_rq)) +		if (cfs_rq_throttled(qcfs_rq))  			goto 
unthrottle_throttle;  		/*  		 * One parent has been throttled and cfs_rq removed from the  		 * list. Add it back to not break the leaf list.  		 */ -		if (throttled_hierarchy(cfs_rq)) -			list_add_leaf_cfs_rq(cfs_rq); +		if (throttled_hierarchy(qcfs_rq)) +			list_add_leaf_cfs_rq(qcfs_rq);  	}  	/* At this point se is NULL and we are at root level*/ @@ -4949,9 +4993,9 @@ unthrottle_throttle:  	 * assertion below.  	 */  	for_each_sched_entity(se) { -		cfs_rq = cfs_rq_of(se); +		struct cfs_rq *qcfs_rq = cfs_rq_of(se); -		if (list_add_leaf_cfs_rq(cfs_rq)) +		if (list_add_leaf_cfs_rq(qcfs_rq))  			break;  	} @@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		cfs_rq->h_nr_running++;  		cfs_rq->idle_h_nr_running += idle_h_nr_running; +		if (cfs_rq_is_idle(cfs_rq)) +			idle_h_nr_running = 1; +  		/* end evaluation on encountering a throttled cfs_rq */  		if (cfs_rq_throttled(cfs_rq))  			goto enqueue_throttle; @@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		cfs_rq->h_nr_running++;  		cfs_rq->idle_h_nr_running += idle_h_nr_running; +		if (cfs_rq_is_idle(cfs_rq)) +			idle_h_nr_running = 1; +  		/* end evaluation on encountering a throttled cfs_rq */  		if (cfs_rq_throttled(cfs_rq))  			goto enqueue_throttle; @@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		cfs_rq->h_nr_running--;  		cfs_rq->idle_h_nr_running -= idle_h_nr_running; +		if (cfs_rq_is_idle(cfs_rq)) +			idle_h_nr_running = 1; +  		/* end evaluation on encountering a throttled cfs_rq */  		if (cfs_rq_throttled(cfs_rq))  			goto dequeue_throttle; @@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		cfs_rq->h_nr_running--;  		cfs_rq->idle_h_nr_running -= idle_h_nr_running; +		if (cfs_rq_is_idle(cfs_rq)) +			idle_h_nr_running = 1; +  		/* end evaluation on encountering a throttled cfs_rq */  		if (cfs_rq_throttled(cfs_rq))  			
goto dequeue_throttle; @@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool  		time = cpu_clock(this);  	} -	for_each_cpu_wrap(cpu, cpus, target) { +	for_each_cpu_wrap(cpu, cpus, target + 1) {  		if (has_idle_core) {  			i = select_idle_core(p, cpu, cpus, &idle_cpu);  			if ((unsigned int)i < nr_cpumask_bits) @@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)  	/* Check a recently used CPU as a potential idle candidate: */  	recent_used_cpu = p->recent_used_cpu; +	p->recent_used_cpu = prev;  	if (recent_used_cpu != prev &&  	    recent_used_cpu != target &&  	    cpus_share_cache(recent_used_cpu, target) && @@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)  	} else if (wake_flags & WF_TTWU) { /* XXX always ? */  		/* Fast path */  		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - -		if (want_affine) -			current->recent_used_cpu = cpu;  	}  	rcu_read_unlock(); @@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)  static void set_last_buddy(struct sched_entity *se)  { -	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) -		return; -  	for_each_sched_entity(se) {  		if (SCHED_WARN_ON(!se->on_rq))  			return; +		if (se_is_idle(se)) +			return;  		cfs_rq_of(se)->last = se;  	}  }  static void set_next_buddy(struct sched_entity *se)  { -	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) -		return; -  	for_each_sched_entity(se) {  		if (SCHED_WARN_ON(!se->on_rq))  			return; +		if (se_is_idle(se)) +			return;  		cfs_rq_of(se)->next = se;  	}  } @@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);  	int scale = cfs_rq->nr_running >= sched_nr_latency;  	int next_buddy_marked = 0; +	int cse_is_idle, pse_is_idle;  	if (unlikely(se == pse))  		return; @@ 
-7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_  		return;  	find_matching_se(&se, &pse); -	update_curr(cfs_rq_of(se));  	BUG_ON(!pse); + +	cse_is_idle = se_is_idle(se); +	pse_is_idle = se_is_idle(pse); + +	/* +	 * Preempt an idle group in favor of a non-idle group (and don't preempt +	 * in the inverse case). +	 */ +	if (cse_is_idle && !pse_is_idle) +		goto preempt; +	if (cse_is_idle != pse_is_idle) +		return; + +	update_curr(cfs_rq_of(se));  	if (wakeup_preempt_entity(se, pse) == 1) {  		/*  		 * Bias pick_next to pick the sched entity that is @@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq)  static inline int find_new_ilb(void)  {  	int ilb; +	const struct cpumask *hk_mask; + +	hk_mask = housekeeping_cpumask(HK_FLAG_MISC); -	for_each_cpu_and(ilb, nohz.idle_cpus_mask, -			      housekeeping_cpumask(HK_FLAG_MISC)) { +	for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {  		if (ilb == smp_processor_id())  			continue; @@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)  {  	int i; +	lockdep_assert_held(&shares_mutex); +  	/*  	 * We can't change the weight of the root cgroup.  	 
*/ @@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)  	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); -	mutex_lock(&shares_mutex);  	if (tg->shares == shares) -		goto done; +		return 0;  	tg->shares = shares;  	for_each_possible_cpu(i) { @@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)  		rq_unlock_irqrestore(rq, &rf);  	} -done: +	return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ +	int ret; + +	mutex_lock(&shares_mutex); +	if (tg_is_idle(tg)) +		ret = -EINVAL; +	else +		ret = __sched_group_set_shares(tg, shares); +	mutex_unlock(&shares_mutex); + +	return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ +	int i; + +	if (tg == &root_task_group) +		return -EINVAL; + +	if (idle < 0 || idle > 1) +		return -EINVAL; + +	mutex_lock(&shares_mutex); + +	if (tg->idle == idle) { +		mutex_unlock(&shares_mutex); +		return 0; +	} + +	tg->idle = idle; + +	for_each_possible_cpu(i) { +		struct rq *rq = cpu_rq(i); +		struct sched_entity *se = tg->se[i]; +		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; +		bool was_idle = cfs_rq_is_idle(grp_cfs_rq); +		long idle_task_delta; +		struct rq_flags rf; + +		rq_lock_irqsave(rq, &rf); + +		grp_cfs_rq->idle = idle; +		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) +			goto next_cpu; + +		idle_task_delta = grp_cfs_rq->h_nr_running - +				  grp_cfs_rq->idle_h_nr_running; +		if (!cfs_rq_is_idle(grp_cfs_rq)) +			idle_task_delta *= -1; + +		for_each_sched_entity(se) { +			struct cfs_rq *cfs_rq = cfs_rq_of(se); + +			if (!se->on_rq) +				break; + +			cfs_rq->idle_h_nr_running += idle_task_delta; + +			/* Already accounted at parent level and above. */ +			if (cfs_rq_is_idle(cfs_rq)) +				break; +		} + +next_cpu: +		rq_unlock_irqrestore(rq, &rf); +	} + +	/* Idle groups have minimum weight. 
*/ +	if (tg_is_idle(tg)) +		__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); +	else +		__sched_group_set_shares(tg, NICE_0_LOAD); +  	mutex_unlock(&shares_mutex);  	return 0;  } +  #else /* CONFIG_FAIR_GROUP_SCHED */  void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14a41a243f7b..a9a660c6e08a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample)   */  #define SCHED_FLAG_SUGOV	0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) +  static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)  {  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL @@ -394,6 +396,9 @@ struct task_group {  	struct cfs_rq		**cfs_rq;  	unsigned long		shares; +	/* A positive value indicates that this is a SCHED_IDLE group. */ +	int			idle; +  #ifdef	CONFIG_SMP  	/*  	 * load_avg can be heavily contended at clock tick time, so put @@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);  #ifdef CONFIG_FAIR_GROUP_SCHED  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); +  #ifdef CONFIG_SMP  extern void set_task_rq_fair(struct sched_entity *se,  			     struct cfs_rq *prev, struct cfs_rq *next); @@ -599,6 +606,9 @@ struct cfs_rq {  	struct list_head	leaf_cfs_rq_list;  	struct task_group	*tg;	/* group that "owns" this runqueue */ +	/* Locally cached copy of our task_group's idle value */ +	int			idle; +  #ifdef CONFIG_CFS_BANDWIDTH  	int			runtime_enabled;  	s64			runtime_remaining; @@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);  #define SCA_CHECK		0x01  #define SCA_MIGRATE_DISABLE	0x02  #define SCA_MIGRATE_ENABLE	0x04 +#define SCA_USER		0x08  #ifdef CONFIG_SMP @@ -2385,6 +2396,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct 
*p, int flags);  extern const_debug unsigned int sysctl_sched_nr_migrate;  extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +#endif +  #ifdef CONFIG_SCHED_HRTICK  /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..4e8698e62f07 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1482,6 +1482,8 @@ int				sched_max_numa_distance;  static int			*sched_domains_numa_distance;  static struct cpumask		***sched_domains_numa_masks;  int __read_mostly		node_reclaim_distance = RECLAIM_DISTANCE; + +static unsigned long __read_mostly *sched_numa_onlined_nodes;  #endif  /* @@ -1833,6 +1835,16 @@ void sched_init_numa(void)  			sched_domains_numa_masks[i][j] = mask;  			for_each_node(k) { +				/* +				 * Distance information can be unreliable for +				 * offline nodes, defer building the node +				 * masks to its bringup. +				 * This relies on all unique distance values +				 * still being visible at init time. 
+				 */ +				if (!node_online(j)) +					continue; +  				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))  					sched_numa_warn("Node-distance not symmetric"); @@ -1886,6 +1898,53 @@ void sched_init_numa(void)  	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];  	init_numa_topology_type(); + +	sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); +	if (!sched_numa_onlined_nodes) +		return; + +	bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); +	for_each_online_node(i) +		bitmap_set(sched_numa_onlined_nodes, i, 1); +} + +static void __sched_domains_numa_masks_set(unsigned int node) +{ +	int i, j; + +	/* +	 * NUMA masks are not built for offline nodes in sched_init_numa(). +	 * Thus, when a CPU of a never-onlined-before node gets plugged in, +	 * adding that new CPU to the right NUMA masks is not sufficient: the +	 * masks of that CPU's node must also be updated. +	 */ +	if (test_bit(node, sched_numa_onlined_nodes)) +		return; + +	bitmap_set(sched_numa_onlined_nodes, node, 1); + +	for (i = 0; i < sched_domains_numa_levels; i++) { +		for (j = 0; j < nr_node_ids; j++) { +			if (!node_online(j) || node == j) +				continue; + +			if (node_distance(j, node) > sched_domains_numa_distance[i]) +				continue; + +			/* Add remote nodes in our masks */ +			cpumask_or(sched_domains_numa_masks[i][node], +				   sched_domains_numa_masks[i][node], +				   sched_domains_numa_masks[0][j]); +		} +	} + +	/* +	 * A new node has been brought up, potentially changing the topology +	 * classification. 
+	 * +	 * Note that this is racy vs any use of sched_numa_topology_type :/ +	 */ +	init_numa_topology_type();  }  void sched_domains_numa_masks_set(unsigned int cpu) @@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)  	int node = cpu_to_node(cpu);  	int i, j; +	__sched_domains_numa_masks_set(node); +  	for (i = 0; i < sched_domains_numa_levels; i++) {  		for (j = 0; j < nr_node_ids; j++) { +			if (!node_online(j)) +				continue; + +			/* Set ourselves in the remote node's masks */  			if (node_distance(j, node) <= sched_domains_numa_distance[i])  				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);  		} | 
