| field     | value | date |
|-----------|-------|------|
| author    | Ingo Molnar <mingo@elte.hu> | 2008-06-16 11:17:50 +0200 |
| committer | Ingo Molnar <mingo@elte.hu> | 2008-06-16 11:17:50 +0200 |
| commit    | 1791a78c0b10fe548bf08a2ed7f84a4ea1385430 (patch) | |
| tree      | 3bc3e71a900a364aaaaf28d553ce44e5dc27092b /kernel/sched.c | |
| parent    | bf07dc864902b3e788de5ab50dc62d5677da90f2 (diff) | |
| parent    | 066519068ad2fbe98c7f45552b1f592903a9c8c8 (diff) | |
Merge branch 'linus' into x86/cleanups
Diffstat (limited to 'kernel/sched.c')
| mode | file | lines changed |
|------|------|---------------|
| -rw-r--r-- | kernel/sched.c | 469 |
1 file changed, 55 insertions(+), 414 deletions(-)
```diff
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..eaf6751e7612 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -136,7 +136,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
 		return 1;
 	return 0;
 }
@@ -312,12 +312,15 @@ static DEFINE_SPINLOCK(task_group_lock);
 #endif
 
 /*
- * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * A weight of a cfs_rq is the sum of weights of which entities
+ * are queued on this cfs_rq, so a weight of a entity should not be
+ * too large, so as the shares value of a task group.
  * (The default weight is 1024 - so there's no practical
  *  limitation from this.)
  */
 #define MIN_SHARES	2
-#define MAX_SHARES	(ULONG_MAX - 1)
+#define MAX_SHARES	(1UL << 18)
 
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
@@ -398,43 +401,6 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
-	} aggregate;
-#endif
 #endif
 };
 
@@ -1368,17 +1334,19 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
-/*
- * delta *= weight / lw
- */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
 {
 	u64 tmp;
 
-	if (!lw->inv_weight)
-		lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
+	if (!lw->inv_weight) {
+		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+			lw->inv_weight = 1;
+		else
+			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+				/ (lw->weight+1);
+	}
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1393,6 +1361,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
@@ -1505,326 +1479,6 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
-{
-	struct task_group *parent, *child;
-
-	rcu_read_lock();
-	parent = &root_task_group;
-down:
-	(*down)(parent, sd);
-	list_for_each_entry_rcu(child, &parent->children, siblings) {
-		parent = child;
-		goto down;
-
-up:
-		continue;
-	}
-	(*up)(parent, sd);
-
-	child = parent;
-	parent = parent->parent;
-	if (parent)
-		goto up;
-	rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
-
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, sd)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
-	}
-
-	aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
-{
-	int boost = 0;
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[tcpu])
-		return;
-
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
-	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
-
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	__set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		    int scpu, int dcpu)
-{
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
-		tg = tg->parent;
-	}
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = aggregate(tg, sd)->shares;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-	aggregate_group_shares(tg, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, sd)->shares;
-	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
-	}
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
 #else /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1845,14 +1499,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1944,7 +1610,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1956,7 +1622,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2609,7 +2275,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3600,12 +3266,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3721,9 +3384,8 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
-	goto out;
+		return -1;
+	return ld_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3738,13 +3400,8 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
-out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
-	return ld_moved;
+		return -1;
+	return 0;
 }
 
 /*
@@ -4430,7 +4087,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
	 */
-	if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
 		__schedule_bug(prev);
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4510,12 +4167,10 @@ need_resched_nonpreemptible:
 	clear_tsk_need_resched(prev);
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-				signal_pending(prev))) {
+		if (unlikely(signal_pending_state(prev->state, prev)))
 			prev->state = TASK_RUNNING;
-		} else {
+		else
 			deactivate_task(rq, prev, 1);
-		}
 		switch_count = &prev->nvcsw;
 	}
 
@@ -4931,8 +4586,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4942,6 +4599,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7316,7 +6974,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7327,7 +6984,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7339,7 +6995,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7351,7 +7006,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7364,7 +7018,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7568,8 +7221,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 static cpumask_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;	/* attribues of custom domains
-						   in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attribues of custom domains in 'doms_cur' */
 
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
@@ -8034,7 +7687,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8599,11 +8251,14 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
+	spin_lock_irq(&rq->lock);
+
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8613,17 +8268,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-}
 
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8662,13 +8308,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-	}
 
 	/*
	 * Enable load balance activity on this group, by inserting it back on
```
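
Note: the most subtle change in the diff is the new guard in calc_delta_mine(), which scales a runtime delta by weight/lw using a cached fixed-point inverse weight. The userspace sketch below illustrates that arithmetic; it is an illustration, not the kernel code. The constants mirror the kernel's 64-bit WMULT_CONST/WMULT_SHIFT and NICE_0_LOAD, the calc_delta() helper and the simplified struct load_weight are stand-ins, and the extra two-stage rounding the kernel applies when the intermediate product could overflow 64 bits is omitted.

```c
/*
 * Sketch of the "delta *= weight / lw" fixed-point scaling done by
 * calc_delta_mine() in the hunk above (64-bit WMULT_CONST case).
 */
#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST	(1ULL << 32)
#define WMULT_SHIFT	32
#define NICE_0_LOAD	1024UL

/* Shift right and round: */
#define SRR(x, y)	(((x) + (1ULL << ((y) - 1))) >> (y))

struct load_weight {
	unsigned long weight;
	unsigned long inv_weight;	/* cached ~ WMULT_CONST / weight */
};

static unsigned long calc_delta(unsigned long delta_exec,
				unsigned long weight, struct load_weight *lw)
{
	uint64_t tmp;

	if (!lw->inv_weight) {
		/*
		 * The guard added by the patch: a 64-bit weight at or above
		 * WMULT_CONST would make the divisor computation meaningless,
		 * so fall back to an inverse weight of 1.
		 */
		if (sizeof(long) > 4 && lw->weight >= WMULT_CONST)
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST - lw->weight / 2)
					   / (lw->weight + 1);
	}

	/*
	 * delta_exec * weight / lw->weight, computed as
	 * delta_exec * weight * inv_weight >> WMULT_SHIFT.
	 * (The kernel also splits the shift when the product risks
	 * overflowing 64 bits; that path is omitted here.)
	 */
	tmp = (uint64_t)delta_exec * weight;
	return (unsigned long)SRR(tmp * lw->inv_weight, WMULT_SHIFT);
}

int main(void)
{
	/* A runqueue whose total load is two nice-0 tasks. */
	struct load_weight lw = { .weight = 2 * NICE_0_LOAD, .inv_weight = 0 };

	/* One nice-0 task is credited roughly half the delta (~499756 here). */
	printf("%lu\n", calc_delta(1000000UL, NICE_0_LOAD, &lw));
	return 0;
}
```

This is also why the diff lowers MAX_SHARES from ULONG_MAX - 1 to 1UL << 18: as the new comment in the hunk explains, a cfs_rq's weight is the sum of the weights of its queued entities, so keeping per-entity weights small keeps that sum in a range the fixed-point math above can handle.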
