Diffstat (limited to 'kernel/sched.c')
 -rw-r--r--  kernel/sched.c | 750
1 file changed, 579 insertions, 171 deletions
| diff --git a/kernel/sched.c b/kernel/sched.c index ccacdbdecf45..0e9344a71be3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)  	return sysctl_sched_rt_runtime >= 0;  } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)  { -	ktime_t now; +	unsigned long delta; +	ktime_t soft, hard, now; + +	for (;;) { +		if (hrtimer_active(period_timer)) +			break; + +		now = hrtimer_cb_get_time(period_timer); +		hrtimer_forward(period_timer, now, period); +		soft = hrtimer_get_softexpires(period_timer); +		hard = hrtimer_get_expires(period_timer); +		delta = ktime_to_ns(ktime_sub(hard, soft)); +		__hrtimer_start_range_ns(period_timer, soft, delta, +					 HRTIMER_MODE_ABS_PINNED, 0); +	} +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{  	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)  		return; @@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)  		return;  	raw_spin_lock(&rt_b->rt_runtime_lock); -	for (;;) { -		unsigned long delta; -		ktime_t soft, hard; - -		if (hrtimer_active(&rt_b->rt_period_timer)) -			break; - -		now = hrtimer_cb_get_time(&rt_b->rt_period_timer); -		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - -		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); -		hard = hrtimer_get_expires(&rt_b->rt_period_timer); -		delta = ktime_to_ns(ktime_sub(hard, soft)); -		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, -				HRTIMER_MODE_ABS_PINNED, 0); -	} +	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);  	raw_spin_unlock(&rt_b->rt_runtime_lock);  } @@ -247,6 +250,24 @@ struct cfs_rq;  static LIST_HEAD(task_groups); +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH +	raw_spinlock_t lock; +	ktime_t period; +	u64 quota, runtime; +	s64 hierarchal_quota; +	u64 runtime_expires; + +	int idle, timer_active; +	struct hrtimer period_timer, slack_timer; +	struct list_head throttled_cfs_rq; + +	/* statistics */ +	int nr_periods, nr_throttled; +	u64 throttled_time; +#endif +}; +  /* task group related information */  struct task_group {  	struct cgroup_subsys_state css; @@ -278,6 +299,8 @@ struct task_group {  #ifdef CONFIG_SCHED_AUTOGROUP  	struct autogroup *autogroup;  #endif + +	struct cfs_bandwidth cfs_bandwidth;  };  /* task_group_lock serializes the addition/removal of task groups */ @@ -311,7 +334,7 @@ struct task_group root_task_group;  /* CFS-related fields in a runqueue */  struct cfs_rq {  	struct load_weight load; -	unsigned long nr_running; +	unsigned long nr_running, h_nr_running;  	u64 exec_clock;  	u64 min_vruntime; @@ -377,9 +400,120 @@ struct cfs_rq {  	unsigned long load_contribution;  #endif +#ifdef CONFIG_CFS_BANDWIDTH +	int runtime_enabled; +	u64 runtime_expires; +	s64 runtime_remaining; + +	u64 throttled_timestamp; +	int throttled, throttle_count; +	struct list_head throttled_list; +#endif  #endif  }; +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_BANDWIDTH +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ +	return &tg->cfs_bandwidth; +} + +static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ +	struct cfs_bandwidth *cfs_b = +		container_of(timer, struct cfs_bandwidth, 
slack_timer); +	do_sched_cfs_slack_timer(cfs_b); + +	return HRTIMER_NORESTART; +} + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ +	struct cfs_bandwidth *cfs_b = +		container_of(timer, struct cfs_bandwidth, period_timer); +	ktime_t now; +	int overrun; +	int idle = 0; + +	for (;;) { +		now = hrtimer_cb_get_time(timer); +		overrun = hrtimer_forward(timer, now, cfs_b->period); + +		if (!overrun) +			break; + +		idle = do_sched_cfs_period_timer(cfs_b, overrun); +	} + +	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	raw_spin_lock_init(&cfs_b->lock); +	cfs_b->runtime = 0; +	cfs_b->quota = RUNTIME_INF; +	cfs_b->period = ns_to_ktime(default_cfs_period()); + +	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); +	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	cfs_b->period_timer.function = sched_cfs_period_timer; +	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ +	cfs_rq->runtime_enabled = 0; +	INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	/* +	 * The timer may be active because we're trying to set a new bandwidth +	 * period or because we're racing with the tear-down path +	 * (timer_active==0 becomes visible before the hrtimer call-back +	 * terminates).  In either case we ensure that it's re-programmed +	 */ +	while (unlikely(hrtimer_active(&cfs_b->period_timer))) { +		raw_spin_unlock(&cfs_b->lock); +		/* ensure cfs_b->lock is available while we wait */ +		hrtimer_cancel(&cfs_b->period_timer); + +		raw_spin_lock(&cfs_b->lock); +		/* if someone else restarted the timer then we're done */ +		if (cfs_b->timer_active) +			return; +	} + +	cfs_b->timer_active = 1; +	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ +	hrtimer_cancel(&cfs_b->period_timer); +	hrtimer_cancel(&cfs_b->slack_timer); +} +#else +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ +	return NULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +  /* Real-Time classes' related field in a runqueue: */  struct rt_rq {  	struct rt_prio_array active; @@ -510,7 +644,7 @@ struct rq {  	unsigned long cpu_power; -	unsigned char idle_at_tick; +	unsigned char idle_balance;  	/* For active balancing */  	int post_schedule;  	int active_balance; @@ -520,8 +654,6 @@ struct rq {  	int cpu;  	int online; -	unsigned long avg_load_per_task; -  	u64 rt_avg;  	u64 age_stamp;  	u64 idle_stamp; @@ -570,7 +702,7 @@ struct rq {  #endif  #ifdef CONFIG_SMP -	struct task_struct *wake_list; +	struct llist_head wake_list;  #endif  }; @@ -1272,6 +1404,18 @@ void wake_up_idle_cpu(int cpu)  		smp_send_reschedule(cpu);  } +static inline bool got_nohz_idle_kick(void) +{ +	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ +	return false; +} +  #endif /* CONFIG_NO_HZ */  static u64 sched_avg_period(void) @@ -1471,24 +1615,28 @@ static inline void 
dec_cpu_load(struct rq *rq, unsigned long load)  	update_load_sub(&rq->load, load);  } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ +			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))  typedef int (*tg_visitor)(struct task_group *, void *);  /* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent.   */ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +static int walk_tg_tree_from(struct task_group *from, +			     tg_visitor down, tg_visitor up, void *data)  {  	struct task_group *parent, *child;  	int ret; -	rcu_read_lock(); -	parent = &root_task_group; +	parent = from; +  down:  	ret = (*down)(parent, data);  	if (ret) -		goto out_unlock; +		goto out;  	list_for_each_entry_rcu(child, &parent->children, siblings) {  		parent = child;  		goto down; @@ -1497,19 +1645,29 @@ up:  		continue;  	}  	ret = (*up)(parent, data); -	if (ret) -		goto out_unlock; +	if (ret || parent == from) +		goto out;  	child = parent;  	parent = parent->parent;  	if (parent)  		goto up; -out_unlock: -	rcu_read_unlock(); - +out:  	return ret;  } +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ + +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ +	return walk_tg_tree_from(&root_task_group, down, up, data); +} +  static int tg_nop(struct task_group *tg, void *data)  {  	return 0; @@ -1569,11 +1727,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)  	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);  	if (nr_running) -		rq->avg_load_per_task = rq->load.weight / nr_running; -	else -		rq->avg_load_per_task = 0; +		return rq->load.weight / nr_running; -	return rq->avg_load_per_task; +	return 0;  }  #ifdef CONFIG_PREEMPT @@ -1739,7 +1895,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)  #ifdef CONFIG_SMP  	/*  	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -	 * successfuly executed on another CPU. We must ensure that updates of +	 * successfully executed on another CPU. We must ensure that updates of  	 * per-task data have been completed by this moment.  	 */  	smp_wmb(); @@ -1806,7 +1962,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)  		rq->nr_uninterruptible--;  	enqueue_task(rq, p, flags); -	inc_nr_running(rq);  }  /* @@ -1818,7 +1973,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)  		rq->nr_uninterruptible++;  	dequeue_task(rq, p, flags); -	dec_nr_running(rq);  }  #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -2390,11 +2544,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  	/* Look for allowed, online CPU in same node. */  	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) -		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) +		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))  			return dest_cpu;  	/* Any allowed, online CPU? 
*/ -	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); +	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);  	if (dest_cpu < nr_cpu_ids)  		return dest_cpu; @@ -2431,7 +2585,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)  	 * [ this allows ->select_task() to simply return task_cpu(p) and  	 *   not worry about this generic constraint ]  	 */ -	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || +	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||  		     !cpu_online(cpu)))  		cpu = select_fallback_rq(task_cpu(p), p); @@ -2556,42 +2710,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)  }  #ifdef CONFIG_SMP -static void sched_ttwu_do_pending(struct task_struct *list) +static void sched_ttwu_pending(void)  {  	struct rq *rq = this_rq(); +	struct llist_node *llist = llist_del_all(&rq->wake_list); +	struct task_struct *p;  	raw_spin_lock(&rq->lock); -	while (list) { -		struct task_struct *p = list; -		list = list->wake_entry; +	while (llist) { +		p = llist_entry(llist, struct task_struct, wake_entry); +		llist = llist_next(llist);  		ttwu_do_activate(rq, p, 0);  	}  	raw_spin_unlock(&rq->lock);  } -#ifdef CONFIG_HOTPLUG_CPU - -static void sched_ttwu_pending(void) -{ -	struct rq *rq = this_rq(); -	struct task_struct *list = xchg(&rq->wake_list, NULL); - -	if (!list) -		return; - -	sched_ttwu_do_pending(list); -} - -#endif /* CONFIG_HOTPLUG_CPU */ -  void scheduler_ipi(void)  { -	struct rq *rq = this_rq(); -	struct task_struct *list = xchg(&rq->wake_list, NULL); - -	if (!list) +	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())  		return;  	/* @@ -2608,25 +2746,21 @@ void scheduler_ipi(void)  	 * somewhat pessimize the simple resched case.  	 */  	irq_enter(); -	sched_ttwu_do_pending(list); +	sched_ttwu_pending(); + +	/* +	 * Check if someone kicked us for doing the nohz idle load balance. +	 */ +	if (unlikely(got_nohz_idle_kick() && !need_resched())) { +		this_rq()->idle_balance = 1; +		raise_softirq_irqoff(SCHED_SOFTIRQ); +	}  	irq_exit();  }  static void ttwu_queue_remote(struct task_struct *p, int cpu)  { -	struct rq *rq = cpu_rq(cpu); -	struct task_struct *next = rq->wake_list; - -	for (;;) { -		struct task_struct *old = next; - -		p->wake_entry = next; -		next = cmpxchg(&rq->wake_list, old, p); -		if (next == old) -			break; -	} - -	if (!next) +	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))  		smp_send_reschedule(cpu);  } @@ -2848,19 +2982,23 @@ void sched_fork(struct task_struct *p)  	p->state = TASK_RUNNING;  	/* +	 * Make sure we do not leak PI boosting priority to the child. +	 */ +	p->prio = current->normal_prio; + +	/*  	 * Revert to default priority/policy on fork if requested.  	 */  	if (unlikely(p->sched_reset_on_fork)) { -		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { +		if (task_has_rt_policy(p)) {  			p->policy = SCHED_NORMAL; -			p->normal_prio = p->static_prio; -		} - -		if (PRIO_TO_NICE(p->static_prio) < 0) {  			p->static_prio = NICE_TO_PRIO(0); -			p->normal_prio = p->static_prio; -			set_load_weight(p); -		} +			p->rt_priority = 0; +		} else if (PRIO_TO_NICE(p->static_prio) < 0) +			p->static_prio = NICE_TO_PRIO(0); + +		p->prio = p->normal_prio = __normal_prio(p); +		set_load_weight(p);  		/*  		 * We don't need the reset flag anymore after the fork. It has @@ -2869,11 +3007,6 @@ void sched_fork(struct task_struct *p)  		p->sched_reset_on_fork = 0;  	} -	/* -	 * Make sure we do not leak PI boosting priority to the child. 
-	 */ -	p->prio = current->normal_prio; -  	if (!rt_prio(p->prio))  		p->sched_class = &fair_sched_class; @@ -3065,7 +3198,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW  	local_irq_disable();  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -	perf_event_task_sched_in(current); +	perf_event_task_sched_in(prev, current);  #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW  	local_irq_enable();  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ @@ -3725,30 +3858,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)  }  /* - * Return sum_exec_runtime for the thread group. - * In case the task is currently running, return the sum plus current's - * pending runtime that have not been accounted yet. - * - * Note that the thread group might have other running tasks as well, - * so the return value not includes other pending runtime that other - * running tasks might have. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ -	struct task_cputime totals; -	unsigned long flags; -	struct rq *rq; -	u64 ns; - -	rq = task_rq_lock(p, &flags); -	thread_group_cputime(p, &totals); -	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); -	task_rq_unlock(rq, p, &flags); - -	return ns; -} - -/*   * Account user cpu time to a process.   * @p: the process that the cpu time gets accounted to   * @cputime: the cpu time spent in user space since the last update @@ -4140,7 +4249,7 @@ void scheduler_tick(void)  	perf_event_task_tick();  #ifdef CONFIG_SMP -	rq->idle_at_tick = idle_cpu(cpu); +	rq->idle_balance = idle_cpu(cpu);  	trigger_load_balance(rq, cpu);  #endif  } @@ -4237,6 +4346,7 @@ static inline void schedule_debug(struct task_struct *prev)  	 */  	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))  		__schedule_bug(prev); +	rcu_sleep_check();  	profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4263,7 +4373,7 @@ pick_next_task(struct rq *rq)  	 * Optimization: we know that if all tasks are in  	 * the fair class we can call that function directly:  	 */ -	if (likely(rq->nr_running == rq->cfs.nr_running)) { +	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {  		p = fair_sched_class.pick_next_task(rq);  		if (likely(p))  			return p; @@ -4279,9 +4389,9 @@ pick_next_task(struct rq *rq)  }  /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function.   */ -asmlinkage void __sched schedule(void) +static void __sched __schedule(void)  {  	struct task_struct *prev, *next;  	unsigned long *switch_count; @@ -4322,16 +4432,6 @@ need_resched:  				if (to_wakeup)  					try_to_wake_up_local(to_wakeup);  			} - -			/* -			 * If we are going to sleep and we have plugged IO -			 * queued, make sure to submit it to avoid deadlocks. -			 */ -			if (blk_needs_flush_plug(prev)) { -				raw_spin_unlock(&rq->lock); -				blk_schedule_flush_plug(prev); -				raw_spin_lock(&rq->lock); -			}  		}  		switch_count = &prev->nvcsw;  	} @@ -4369,6 +4469,26 @@ need_resched:  	if (need_resched())  		goto need_resched;  } + +static inline void sched_submit_work(struct task_struct *tsk) +{ +	if (!tsk->state) +		return; +	/* +	 * If we are going to sleep and we have plugged IO queued, +	 * make sure to submit it to avoid deadlocks. 
+	 */ +	if (blk_needs_flush_plug(tsk)) +		blk_schedule_flush_plug(tsk); +} + +asmlinkage void __sched schedule(void) +{ +	struct task_struct *tsk = current; + +	sched_submit_work(tsk); +	__schedule(); +}  EXPORT_SYMBOL(schedule);  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER @@ -4435,7 +4555,7 @@ asmlinkage void __sched notrace preempt_schedule(void)  	do {  		add_preempt_count_notrace(PREEMPT_ACTIVE); -		schedule(); +		__schedule();  		sub_preempt_count_notrace(PREEMPT_ACTIVE);  		/* @@ -4463,7 +4583,7 @@ asmlinkage void __sched preempt_schedule_irq(void)  	do {  		add_preempt_count(PREEMPT_ACTIVE);  		local_irq_enable(); -		schedule(); +		__schedule();  		local_irq_disable();  		sub_preempt_count(PREEMPT_ACTIVE); @@ -5039,7 +5159,20 @@ EXPORT_SYMBOL(task_nice);   */  int idle_cpu(int cpu)  { -	return cpu_curr(cpu) == cpu_rq(cpu)->idle; +	struct rq *rq = cpu_rq(cpu); + +	if (rq->curr != rq->idle) +		return 0; + +	if (rq->nr_running) +		return 0; + +#ifdef CONFIG_SMP +	if (!llist_empty(&rq->wake_list)) +		return 0; +#endif + +	return 1;  }  /** @@ -5588,7 +5721,7 @@ static inline int should_resched(void)  static void __cond_resched(void)  {  	add_preempt_count(PREEMPT_ACTIVE); -	schedule(); +	__schedule();  	sub_preempt_count(PREEMPT_ACTIVE);  } @@ -5889,7 +6022,7 @@ void show_state_filter(unsigned long state_filter)  	printk(KERN_INFO  		"  task                        PC stack   pid father\n");  #endif -	read_lock(&tasklist_lock); +	rcu_read_lock();  	do_each_thread(g, p) {  		/*  		 * reset the NMI-timeout, listing all files on a slow @@ -5905,7 +6038,7 @@ void show_state_filter(unsigned long state_filter)  #ifdef CONFIG_SCHED_DEBUG  	sysrq_sched_debug_show();  #endif -	read_unlock(&tasklist_lock); +	rcu_read_unlock();  	/*  	 * Only show locks if all tasks are dumped:  	 */ @@ -5969,15 +6102,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)  }  /* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_BITS_NONE. - */ -cpumask_var_t nohz_cpu_mask; - -/*   * Increase the granularity value when there are more CPUs,   * because with more CPUs the 'effective latency' as visible   * to users decreases. But the relationship is not linear, @@ -6029,10 +6153,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  {  	if (p->sched_class && p->sched_class->set_cpus_allowed)  		p->sched_class->set_cpus_allowed(p, new_mask); -	else { -		cpumask_copy(&p->cpus_allowed, new_mask); -		p->rt.nr_cpus_allowed = cpumask_weight(new_mask); -	} + +	cpumask_copy(&p->cpus_allowed, new_mask); +	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);  }  /* @@ -6130,7 +6253,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)  	if (task_cpu(p) != src_cpu)  		goto done;  	/* Affinity changed (again). 
*/ -	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) +	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))  		goto fail;  	/* @@ -6211,6 +6334,30 @@ static void calc_global_load_remove(struct rq *rq)  	rq->calc_load_active = 0;  } +#ifdef CONFIG_CFS_BANDWIDTH +static void unthrottle_offline_cfs_rqs(struct rq *rq) +{ +	struct cfs_rq *cfs_rq; + +	for_each_leaf_cfs_rq(rq, cfs_rq) { +		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + +		if (!cfs_rq->runtime_enabled) +			continue; + +		/* +		 * clock_task is not advancing so we just need to make sure +		 * there's some valid quota amount +		 */ +		cfs_rq->runtime_remaining = cfs_b->quota; +		if (cfs_rq_throttled(cfs_rq)) +			unthrottle_cfs_rq(cfs_rq); +	} +} +#else +static void unthrottle_offline_cfs_rqs(struct rq *rq) {} +#endif +  /*   * Migrate all tasks from the rq, sleeping tasks will be migrated by   * try_to_wake_up()->select_task_rq(). @@ -6236,6 +6383,9 @@ static void migrate_tasks(unsigned int dead_cpu)  	 */  	rq->stop = NULL; +	/* Ensure any throttled groups are reachable by pick_next_task */ +	unthrottle_offline_cfs_rqs(rq); +  	for ( ; ; ) {  		/*  		 * There's this thread running, bail when that's the only @@ -6937,8 +7087,6 @@ static int __init isolated_cpu_setup(char *str)  __setup("isolcpus=", isolated_cpu_setup); -#define SD_NODES_PER_DOMAIN 16 -  #ifdef CONFIG_NUMA  /** @@ -7443,6 +7591,7 @@ static void __sdt_free(const struct cpumask *cpu_map)  			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);  			if (sd && (sd->flags & SD_OVERLAP))  				free_sched_groups(sd->groups, 0); +			kfree(*per_cpu_ptr(sdd->sd, j));  			kfree(*per_cpu_ptr(sdd->sg, j));  			kfree(*per_cpu_ptr(sdd->sgp, j));  		} @@ -7978,6 +8127,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  	/* allow initial update_cfs_load() to truncate */  	cfs_rq->load_stamp = 1;  #endif +	init_cfs_rq_runtime(cfs_rq);  	tg->cfs_rq[cpu] = cfs_rq;  	tg->se[cpu] = se; @@ -8117,6 +8267,7 @@ void __init sched_init(void)  		 * We achieve this by letting root_task_group's tasks sit  		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).  		 */ +		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);  		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);  #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8146,7 +8297,6 @@ void __init sched_init(void)  		rq_attach_root(rq, &def_root_domain);  #ifdef CONFIG_NO_HZ  		rq->nohz_balance_kick = 0; -		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));  #endif  #endif  		init_rq_hrtick(rq); @@ -8188,8 +8338,6 @@ void __init sched_init(void)  	 */  	current->sched_class = &fair_sched_class; -	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ -	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);  #ifdef CONFIG_SMP  	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);  #ifdef CONFIG_NO_HZ @@ -8219,6 +8367,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)  {  	static unsigned long prev_jiffy;	/* ratelimiting */ +	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/  	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||  	    system_state != SYSTEM_RUNNING || oops_in_progress)  		return; @@ -8358,6 +8507,8 @@ static void free_fair_sched_group(struct task_group *tg)  {  	int i; +	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); +  	for_each_possible_cpu(i) {  		if (tg->cfs_rq)  			kfree(tg->cfs_rq[i]); @@ -8385,6 +8536,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	tg->shares = NICE_0_LOAD; +	init_cfs_bandwidth(tg_cfs_bandwidth(tg)); +  	for_each_possible_cpu(i) {  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),  				      GFP_KERNEL, cpu_to_node(i)); @@ -8660,12 +8813,7 @@ unsigned long sched_group_shares(struct task_group *tg)  }  #endif -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)  static unsigned long to_ratio(u64 period, u64 runtime)  {  	if (runtime == RUNTIME_INF) @@ -8673,6 +8821,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)  	return div64_u64(runtime << 20, period);  } +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex);  /* Must be called with tasklist_lock held */  static inline int tg_has_rt_tasks(struct task_group *tg) @@ -8693,7 +8848,7 @@ struct rt_schedulable_data {  	u64 rt_runtime;  }; -static int tg_schedulable(struct task_group *tg, void *data) +static int tg_rt_schedulable(struct task_group *tg, void *data)  {  	struct rt_schedulable_data *d = data;  	struct task_group *child; @@ -8751,16 +8906,22 @@ static int tg_schedulable(struct task_group *tg, void *data)  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)  { +	int ret; +  	struct rt_schedulable_data data = {  		.tg = tg,  		.rt_period = period,  		.rt_runtime = runtime,  	}; -	return walk_tg_tree(tg_schedulable, tg_nop, &data); +	rcu_read_lock(); +	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); +	rcu_read_unlock(); + +	return ret;  } -static int tg_set_bandwidth(struct task_group *tg, +static int tg_set_rt_bandwidth(struct task_group *tg,  		u64 rt_period, u64 rt_runtime)  {  	int i, err = 0; @@ -8799,7 +8960,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)  	if (rt_runtime_us < 0)  		rt_runtime = RUNTIME_INF; -	return tg_set_bandwidth(tg, rt_period, rt_runtime); +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);  }  long sched_group_rt_runtime(struct task_group *tg) @@ -8824,7 +8985,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)  	if (rt_period == 0)  		return -EINVAL; -	return tg_set_bandwidth(tg, rt_period, rt_runtime); +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);  }  long sched_group_rt_period(struct task_group *tg) @@ -9014,6 +9175,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)  	return (u64) scale_load_down(tg->shares);  } + +#ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ +	int i, ret = 0, runtime_enabled; +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + +	if (tg == &root_task_group) +		
return -EINVAL; + +	/* +	 * Ensure we have at some amount of bandwidth every period.  This is +	 * to prevent reaching a state of large arrears when throttled via +	 * entity_tick() resulting in prolonged exit starvation. +	 */ +	if (quota < min_cfs_quota_period || period < min_cfs_quota_period) +		return -EINVAL; + +	/* +	 * Likewise, bound things on the otherside by preventing insane quota +	 * periods.  This also allows us to normalize in computing quota +	 * feasibility. +	 */ +	if (period > max_cfs_quota_period) +		return -EINVAL; + +	mutex_lock(&cfs_constraints_mutex); +	ret = __cfs_schedulable(tg, period, quota); +	if (ret) +		goto out_unlock; + +	runtime_enabled = quota != RUNTIME_INF; +	raw_spin_lock_irq(&cfs_b->lock); +	cfs_b->period = ns_to_ktime(period); +	cfs_b->quota = quota; + +	__refill_cfs_bandwidth_runtime(cfs_b); +	/* restart the period timer (if active) to handle new period expiry */ +	if (runtime_enabled && cfs_b->timer_active) { +		/* force a reprogram */ +		cfs_b->timer_active = 0; +		__start_cfs_bandwidth(cfs_b); +	} +	raw_spin_unlock_irq(&cfs_b->lock); + +	for_each_possible_cpu(i) { +		struct cfs_rq *cfs_rq = tg->cfs_rq[i]; +		struct rq *rq = rq_of(cfs_rq); + +		raw_spin_lock_irq(&rq->lock); +		cfs_rq->runtime_enabled = runtime_enabled; +		cfs_rq->runtime_remaining = 0; + +		if (cfs_rq_throttled(cfs_rq)) +			unthrottle_cfs_rq(cfs_rq); +		raw_spin_unlock_irq(&rq->lock); +	} +out_unlock: +	mutex_unlock(&cfs_constraints_mutex); + +	return ret; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ +	u64 quota, period; + +	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); +	if (cfs_quota_us < 0) +		quota = RUNTIME_INF; +	else +		quota = (u64)cfs_quota_us * NSEC_PER_USEC; + +	return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ +	u64 quota_us; + +	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) +		return -1; + +	quota_us = tg_cfs_bandwidth(tg)->quota; +	do_div(quota_us, NSEC_PER_USEC); + +	return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ +	u64 quota, period; + +	period = (u64)cfs_period_us * NSEC_PER_USEC; +	quota = tg_cfs_bandwidth(tg)->quota; + +	if (period <= 0) +		return -EINVAL; + +	return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ +	u64 cfs_period_us; + +	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); +	do_div(cfs_period_us, NSEC_PER_USEC); + +	return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ +	return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, +				s64 cfs_quota_us) +{ +	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ +	return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, +				u64 cfs_period_us) +{ +	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +struct cfs_schedulable_data { +	struct task_group *tg; +	u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, +			       struct cfs_schedulable_data *d) +{ +	u64 quota, period; + +	if (tg == d->tg) { +		period = d->period; +		quota = d->quota; +	} else { +		period = tg_get_cfs_period(tg); +		quota = tg_get_cfs_quota(tg); 
+	} + +	/* note: these should typically be equivalent */ +	if (quota == RUNTIME_INF || quota == -1) +		return RUNTIME_INF; + +	return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ +	struct cfs_schedulable_data *d = data; +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); +	s64 quota = 0, parent_quota = -1; + +	if (!tg->parent) { +		quota = RUNTIME_INF; +	} else { +		struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); + +		quota = normalize_cfs_quota(tg, d); +		parent_quota = parent_b->hierarchal_quota; + +		/* +		 * ensure max(child_quota) <= parent_quota, inherit when no +		 * limit is set +		 */ +		if (quota == RUNTIME_INF) +			quota = parent_quota; +		else if (parent_quota != RUNTIME_INF && quota > parent_quota) +			return -EINVAL; +	} +	cfs_b->hierarchal_quota = quota; + +	return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ +	int ret; +	struct cfs_schedulable_data data = { +		.tg = tg, +		.period = period, +		.quota = quota, +	}; + +	if (quota != RUNTIME_INF) { +		do_div(data.period, NSEC_PER_USEC); +		do_div(data.quota, NSEC_PER_USEC); +	} + +	rcu_read_lock(); +	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); +	rcu_read_unlock(); + +	return ret; +} + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +		struct cgroup_map_cb *cb) +{ +	struct task_group *tg = cgroup_tg(cgrp); +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + +	cb->fill(cb, "nr_periods", cfs_b->nr_periods); +	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); +	cb->fill(cb, "throttled_time", cfs_b->throttled_time); + +	return 0; +} +#endif /* CONFIG_CFS_BANDWIDTH */  #endif /* CONFIG_FAIR_GROUP_SCHED */  #ifdef CONFIG_RT_GROUP_SCHED @@ -9048,6 +9441,22 @@ static struct cftype cpu_files[] = {  		.write_u64 = cpu_shares_write_u64,  	},  #endif +#ifdef CONFIG_CFS_BANDWIDTH +	{ +		.name = "cfs_quota_us", +		.read_s64 = cpu_cfs_quota_read_s64, +		.write_s64 = cpu_cfs_quota_write_s64, +	}, +	{ +		.name = "cfs_period_us", +		.read_u64 = cpu_cfs_period_read_u64, +		.write_u64 = cpu_cfs_period_write_u64, +	}, +	{ +		.name = "stat", +		.read_map = cpu_stats_show, +	}, +#endif  #ifdef CONFIG_RT_GROUP_SCHED  	{  		.name = "rt_runtime_us", @@ -9357,4 +9766,3 @@ struct cgroup_subsys cpuacct_subsys = {  	.subsys_id = cpuacct_subsys_id,  };  #endif	/* CONFIG_CGROUP_CPUACCT */ - | 
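The __cfs_schedulable()/tg_cfs_schedulable_down() walk above normalizes each group's quota against its period with to_ratio() (runtime << 20 / period) and requires that a child's normalized quota never exceed its parent's, with an unlimited child inheriting the parent's ratio. A minimal userspace sketch of that arithmetic, using hypothetical helper names and microsecond inputs as in __cfs_schedulable():

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	(~0ULL)

/* same fixed-point scaling the patch uses: runtime << 20 / period */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return RUNTIME_INF;
	return (runtime << 20) / period;
}

/*
 * Hypothetical stand-in for tg_cfs_schedulable_down(): a child is
 * schedulable if its normalized quota does not exceed its parent's;
 * a child with no limit of its own inherits the parent's ratio.
 */
static int child_feasible(uint64_t parent_ratio, uint64_t period_us,
			  uint64_t quota_us)
{
	uint64_t ratio = to_ratio(period_us, quota_us);

	if (ratio == RUNTIME_INF)
		return 1;
	return parent_ratio == RUNTIME_INF || ratio <= parent_ratio;
}

int main(void)
{
	/* parent: 50ms per 100ms period; child asks for 75ms per 100ms */
	uint64_t parent = to_ratio(100000, 50000);

	printf("feasible: %d\n", child_feasible(parent, 100000, 75000));
	return 0;	/* prints "feasible: 0" */
}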
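With CONFIG_CFS_BANDWIDTH enabled, the three cftype entries added above expose cpu.cfs_quota_us, cpu.cfs_period_us and cpu.stat to userspace. A rough usage sketch follows; the mount point /sys/fs/cgroup/cpu and the group name "limited" are assumptions for illustration, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* assumed cpu-controller mount point and group; adjust for your system */
#define GRP	"/sys/fs/cgroup/cpu/limited/"

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char buf[256];
	int fd, n;

	/* allow 25ms of CPU time every 100ms period (values in usecs) */
	write_str(GRP "cpu.cfs_period_us", "100000");
	write_str(GRP "cpu.cfs_quota_us", "25000");

	/* cpu.stat reports nr_periods, nr_throttled and throttled_time */
	fd = open(GRP "cpu.stat", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}
		close(fd);
	}
	return 0;
}

Per tg_set_cfs_quota() in the patch, writing a negative value (e.g. -1) to cpu.cfs_quota_us maps to RUNTIME_INF and removes the bandwidth limit again.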