Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  249
1 file changed, 200 insertions(+), 49 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
 #define SCHED_FEAT(name, enabled)	\
 	#name ,
 
-static __read_mostly char *sched_feat_names[] = {
+static const char * const sched_feat_names[] = {
 #include "features.h"
-	NULL
 };
 
 #undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 	sched_avg_update(this_rq);
 }
 
+#ifdef CONFIG_NO_HZ
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
 /*
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
 void update_idle_cpu_load(struct rq *this_rq)
 {
-	unsigned long curr_jiffies = jiffies;
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
 	unsigned long load = this_rq->load.weight;
 	unsigned long pending_updates;
 
 	/*
-	 * Bloody broken means of dealing with nohz, but better than nothing..
-	 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
-	 * update and see 0 difference the one time and 2 the next, even though
-	 * we ticked at roughtly the same rate.
-	 *
-	 * Hence we only use this from nohz_idle_balance() and skip this
-	 * nonsense when called from the scheduler_tick() since that's
-	 * guaranteed a stable rate.
+	 * bail if there's load or we're actually up-to-date.
 	 */
 	if (load || curr_jiffies == this_rq->last_load_update_tick)
 		return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
 }
 
 /*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
  * Called from scheduler_tick()
  */
 static void update_cpu_load_active(struct rq *this_rq)
 {
 	/*
-	 * See the mess in update_idle_cpu_load().
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
 	__update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
 	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
 /*
@@ -5524,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
 
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
 {
-	sched_domain_debug_enabled = 1;
+	sched_debug_enabled = 1;
 
 	return 0;
 }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+	return sched_debug_enabled;
+}
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
@@ -5572,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->sgp->power) {
+		/*
+		 * Even though we initialize ->power to something semi-sane,
+		 * we leave power_orig unset. This allows us to detect if
+		 * domain iteration is still funny without causing /0 traps.
+		 */
+		if (!group->sgp->power_orig) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -5620,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int level = 0;
 
-	if (!sched_domain_debug_enabled)
+	if (!sched_debug_enabled)
 		return;
 
 	if (!sd) {
@@ -5641,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+	return false;
+}
 #endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6008,44 @@ struct sched_domain_topology_level {
 	struct sd_data      data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+	const struct cpumask *span = sched_domain_span(sd);
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *sibling;
+	int i;
+
+	for_each_cpu(i, span) {
+		sibling = *per_cpu_ptr(sdd->sd, i);
+		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+			continue;
+
+		cpumask_set_cpu(i, sched_group_mask(sg));
+	}
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
@@ -5980,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
+		child = *per_cpu_ptr(sdd->sd, i);
+
+		/* See the comment near build_group_mask(). */
+		if (!cpumask_test_cpu(i, sched_domain_span(child)))
+			continue;
+
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(cpu));
 
@@ -5987,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 			goto fail;
 
 		sg_span = sched_group_cpus(sg);
-
-		child = *per_cpu_ptr(sdd->sd, i);
 		if (child->child) {
 			child = child->child;
 			cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_or(covered, covered, sg_span);
 
-		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
-		atomic_inc(&sg->sgp->ref);
+		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
+		if (atomic_inc_return(&sg->sgp->ref) == 1)
+			build_group_mask(sd, sg);
 
-		if (cpumask_test_cpu(cpu, sg_span))
+		/*
+		 * Initialize sgp->power such that even if we mess up the
+		 * domains and no possible iteration will get us here, we won't
+		 * die on a /0 trap.
+		 */
+		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+		/*
+		 * Make sure the first group of this domain contains the
+		 * canonical balance cpu. Otherwise the sched_domain iteration
+		 * breaks. See update_sg_lb_stats().
+		 */
+		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
+		    group_balance_cpu(sg) == cpu)
 			groups = sg;
 
 		if (!first)
@@ -6074,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
+		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_first_cpu(sg))
+	if (cpu != group_balance_cpu(sg))
 		return;
 
 	update_group_power(sd, cpu);
@@ -6165,11 +6268,8 @@ int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
-	unsigned long val;
-
-	val = simple_strtoul(str, NULL, 0);
-	if (val < sched_domain_level_max)
-		default_relax_domain_level = val;
+	if (kstrtoint(str, 0, &default_relax_domain_level))
+		pr_warn("Unable to set relax_domain_level\n");
 
 	return 1;
 }
@@ -6279,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
 
 #ifdef CONFIG_NUMA
 static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 
 static inline int sd_local_flags(int level)
 {
-	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
 		return 0;
 
 	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 }
 
+static void sched_numa_warn(const char *str)
+{
+	static int done = false;
+	int i,j;
+
+	if (done)
+		return;
+
+	done = true;
+
+	printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		printk(KERN_WARNING "  ");
+		for (j = 0; j < nr_node_ids; j++)
+			printk(KERN_CONT "%02d ", node_distance(i,j));
+		printk(KERN_CONT "\n");
+	}
+	printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+	int i;
+
+	if (distance == node_distance(0, 0))
+		return true;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		if (sched_domains_numa_distance[i] == distance)
+			return true;
+	}
+
+	return false;
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6486,6 @@ static void sched_init_numa(void)
 	int level = 0;
 	int i, j, k;
 
-	sched_domains_numa_scale = curr_distance;
 	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
 	if (!sched_domains_numa_distance)
 		return;
@@ -6362,23 +6496,41 @@ static void sched_init_numa(void)
 	 *
 	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
-	 *
-	 * XXX: could be optimized to O(n log n) by using sort()
 	 */
 	next_distance = curr_distance;
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			int distance = node_distance(0, j);
-			if (distance > curr_distance &&
-					(distance < next_distance ||
-					 next_distance == curr_distance))
-				next_distance = distance;
+			for (k = 0; k < nr_node_ids; k++) {
+				int distance = node_distance(i, k);
+
+				if (distance > curr_distance &&
+				    (distance < next_distance ||
+				     next_distance == curr_distance))
+					next_distance = distance;
+
+				/*
+				 * While not a strong assumption it would be nice to know
+				 * about cases where if node A is connected to B, B is not
+				 * equally connected to A.
+				 */
+				if (sched_debug() && node_distance(k, i) != distance)
+					sched_numa_warn("Node-distance not symmetric");
+
+				if (sched_debug() && i && !find_numa_distance(distance))
+					sched_numa_warn("Node-0 not representative");
+			}
+			if (next_distance != curr_distance) {
+				sched_domains_numa_distance[level++] = next_distance;
+				sched_domains_numa_levels = level;
+				curr_distance = next_distance;
+			} else break;
 		}
-		if (next_distance != curr_distance) {
-			sched_domains_numa_distance[level++] = next_distance;
-			sched_domains_numa_levels = level;
-			curr_distance = next_distance;
-		} else break;
+
+		/*
+		 * In case of sched_debug() we verify the above assumption.
+		 */
+		if (!sched_debug())
+			break;
 	}
 	/*
 	 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6555,7 @@ static void sched_init_numa(void)
 			return;
 
 		for (j = 0; j < nr_node_ids; j++) {
-			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
 			if (!mask)
 				return;
 
@@ -6490,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
 
-			sgp = kzalloc_node(sizeof(struct sched_group_power),
+			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sgp)
 				return -ENOMEM;
@@ -6543,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 	if (!sd)
 		return child;
 
-	set_domain_attribute(sd, attr);
 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
@@ -6551,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		child->parent = sd;
 	}
 	sd->child = child;
+	set_domain_attribute(sd, attr);
 
 	return sd;
 }
@@ -6691,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
 	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-	dattr_cur = NULL;
 	err = build_sched_domains(doms_cur[0], NULL);
 	register_sched_domain_sysctl();
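The reworked sched_init_numa() loop above derives the NUMA topology levels by repeatedly picking the next-larger unique value out of the node_distance() table, and (under sched_debug) warns when the table is not symmetric. Below is a minimal standalone userspace sketch of that same level-extraction idea, restructured for clarity rather than copied from the patch; NR_NODES and the sample distance table are invented for the illustration.

/*
 * Userspace illustration only: repeatedly pick the smallest node distance
 * larger than the current one until no larger distance exists. The 4-node
 * distance table is made up and not taken from the patch.
 */
#include <stdio.h>

#define NR_NODES 4

static const int distance_table[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

static int node_distance(int i, int j)
{
	return distance_table[i][j];
}

int main(void)
{
	int levels[NR_NODES * NR_NODES];
	int nr_levels = 0;
	int curr = node_distance(0, 0);	/* the local distance */
	int i, k;

	for (;;) {
		int next = curr;

		for (i = 0; i < NR_NODES; i++) {
			for (k = 0; k < NR_NODES; k++) {
				int d = node_distance(i, k);

				if (d > curr && (d < next || next == curr))
					next = d;

				/* The asymmetry sched_numa_warn() complains about. */
				if (node_distance(k, i) != d)
					fprintf(stderr, "distance(%d,%d) not symmetric\n", i, k);
			}
		}

		if (next == curr)
			break;	/* no larger distance left: all levels found */

		levels[nr_levels++] = next;
		curr = next;
	}

	printf("%d remote distance level(s):", nr_levels);
	for (i = 0; i < nr_levels; i++)
		printf(" %d", levels[i]);
	printf("\n");

	return 0;
}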
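The patch also introduces a per-group iteration mask: build_overlap_sched_groups() records which CPUs may act for a group, and group_balance_cpu() returns the first CPU that is in both the group span and that mask (via cpumask_first_and()). A tiny standalone analogue of that "lowest bit set in both masks" selection, using plain bitmasks; the helper name and example masks are invented here.

#include <stdio.h>

/* Return the index of the lowest bit set in both masks, or -1 if none. */
static int first_common_bit(unsigned long span, unsigned long mask)
{
	unsigned long both = span & mask;

	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long group_span = 0xf0UL;	/* CPUs 4-7 in the group */
	unsigned long iter_mask  = 0x30UL;	/* only CPUs 4-5 may balance it */

	printf("balance cpu = %d\n", first_common_bit(group_span, iter_mask));
	return 0;
}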

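setup_relax_domain_level() now parses its argument with kstrtoint(), which reports malformed or out-of-range input instead of silently using whatever prefix simple_strtoul() happened to convert. Below is a userspace sketch of the same validate-then-assign pattern using strtol(); parse_level() is a hypothetical helper, not a kernel function.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <limits.h>

static int parse_level(const char *str, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(str, &end, 0);
	if (errno || end == str || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -1;	/* report the failure, as kstrtoint() would */

	*out = (int)val;
	return 0;
}

int main(void)
{
	int level;

	if (parse_level("2", &level) == 0)
		printf("relax_domain_level = %d\n", level);

	if (parse_level("2bogus", &level) != 0)
		printf("rejected malformed value\n");

	return 0;
}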