Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1048
1 file changed, 642 insertions(+), 406 deletions(-)
| diff --git a/kernel/sched.c b/kernel/sched.c index 76080d142e3d..a07cff90d849 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -166,7 +166,7 @@  #define SCALE_PRIO(x, prio) \  	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p)  {  	if (p->static_prio < NICE_TO_PRIO(0))  		return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); @@ -206,7 +206,7 @@ struct runqueue {  	 */  	unsigned long nr_running;  #ifdef CONFIG_SMP -	unsigned long cpu_load; +	unsigned long cpu_load[3];  #endif  	unsigned long long nr_switches; @@ -260,22 +260,86 @@ struct runqueue {  static DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */  #define for_each_domain(cpu, domain) \ -	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)  #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))  #define this_rq()		(&__get_cpu_var(runqueues))  #define task_rq(p)		cpu_rq(task_cpu(p))  #define cpu_curr(cpu)		(cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */  #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next)	do { } while (0) -# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p)		((rq)->curr == (p)) +# define prepare_arch_switch(next)	do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev)	do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ +	return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +	spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP +	return p->oncpu; +#else +	return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP +	/* +	 * We can optimise this out completely for !SMP, because the +	 * SMP rebalancing from interrupt is the only thing that cares +	 * here. +	 */ +	next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	spin_unlock_irq(&rq->lock); +#else +	spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP +	/* +	 * After ->oncpu is cleared, the task can be moved to a different CPU. +	 * We must ensure this doesn't happen until the switch is completely +	 * finished. 
+	 */ +	smp_wmb(); +	prev->oncpu = 0;  #endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */  /*   * task_rq_lock - lock the runqueue a given task resides on and disable @@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)   * bump this up when changing the output format or the meaning of an existing   * format, so that tools can adapt (or abort)   */ -#define SCHEDSTAT_VERSION 11 +#define SCHEDSTAT_VERSION 12  static int show_schedstat(struct seq_file *seq, void *v)  { @@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)  #ifdef CONFIG_SMP  		/* domain-specific stats */ +		preempt_disable();  		for_each_domain(cpu, sd) {  			enum idle_type itype;  			char mask_str[NR_CPUS]; @@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)  				    sd->lb_nobusyq[itype],  				    sd->lb_nobusyg[itype]);  			} -			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", +			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",  			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed, -			    sd->sbe_pushed, sd->sbe_attempts, +			    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, +			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,  			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);  		} +		preempt_enable();  #endif  	}  	return 0; @@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)  	return rq;  } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ -	int sib; -	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { -		if (idle_cpu(sib)) -			continue; -		return 0; -	} - -	return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif -  #ifdef CONFIG_SCHEDSTATS  /*   * Called when a process is dequeued from the active array and given @@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)  	rq->nr_running++;  } -static void recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(task_t *p, unsigned long long now)  {  	/* Caller must always ensure 'now >= p->timestamp' */  	unsigned long long __sleep_time = now - p->timestamp; @@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)  		}  	} -	p->prio = effective_prio(p); +	return effective_prio(p);  }  /* @@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)  	}  #endif -	recalc_task_prio(p, now); +	p->prio = recalc_task_prio(p, now);  	/*  	 * This checks to make sure it's not an uninterruptible task @@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)  }  #ifdef CONFIG_SMP -enum request_type { -	REQ_MOVE_TASK, -	REQ_SET_DOMAIN, -}; -  typedef struct {  	struct list_head list; -	enum request_type type; -	/* For REQ_MOVE_TASK */  	task_t *task;  	int dest_cpu; -	/* For REQ_SET_DOMAIN */ -	struct sched_domain *sd; -  	struct completion done;  } migration_req_t; @@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)  	}  	init_completion(&req->done); -	req->type = REQ_MOVE_TASK;  	req->task = p;  	req->dest_cpu = dest_cpu;  	list_add(&req->list, &rq->migration_queue); @@ -886,26 +926,154 @@ void kick_process(task_t *p)   * We want to under-estimate the load of migration sources, to   * balance conservatively.   
*/ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type)  {  	runqueue_t *rq = cpu_rq(cpu);  	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; +	if (type == 0) +		return load_now; -	return min(rq->cpu_load, load_now); +	return min(rq->cpu_load[type-1], load_now);  }  /*   * Return a high guess at the load of a migration-target cpu   */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type)  {  	runqueue_t *rq = cpu_rq(cpu);  	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; +	if (type == 0) +		return load_now; -	return max(rq->cpu_load, load_now); +	return max(rq->cpu_load[type-1], load_now);  } -#endif +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ +	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; +	unsigned long min_load = ULONG_MAX, this_load = 0; +	int load_idx = sd->forkexec_idx; +	int imbalance = 100 + (sd->imbalance_pct-100)/2; + +	do { +		unsigned long load, avg_load; +		int local_group; +		int i; + +		local_group = cpu_isset(this_cpu, group->cpumask); +		/* XXX: put a cpus allowed check */ + +		/* Tally up the load of all CPUs in the group */ +		avg_load = 0; + +		for_each_cpu_mask(i, group->cpumask) { +			/* Bias balancing toward cpus of our domain */ +			if (local_group) +				load = source_load(i, load_idx); +			else +				load = target_load(i, load_idx); + +			avg_load += load; +		} + +		/* Adjust by relative CPU power of the group */ +		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + +		if (local_group) { +			this_load = avg_load; +			this = group; +		} else if (avg_load < min_load) { +			min_load = avg_load; +			idlest = group; +		} +		group = group->next; +	} while (group != sd->groups); + +	if (!idlest || 100*this_load < imbalance*min_load) +		return NULL; +	return idlest; +} + +/* + * find_idlest_queue - find the idlest runqueue among the cpus in group. + */ +static int find_idlest_cpu(struct sched_group *group, int this_cpu) +{ +	unsigned long load, min_load = ULONG_MAX; +	int idlest = -1; +	int i; + +	for_each_cpu_mask(i, group->cpumask) { +		load = source_load(i, 0); + +		if (load < min_load || (load == min_load && i == this_cpu)) { +			min_load = load; +			idlest = i; +		} +	} + +	return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. 
+ */ +static int sched_balance_self(int cpu, int flag) +{ +	struct task_struct *t = current; +	struct sched_domain *tmp, *sd = NULL; + +	for_each_domain(cpu, tmp) +		if (tmp->flags & flag) +			sd = tmp; + +	while (sd) { +		cpumask_t span; +		struct sched_group *group; +		int new_cpu; +		int weight; + +		span = sd->span; +		group = find_idlest_group(sd, t, cpu); +		if (!group) +			goto nextlevel; + +		new_cpu = find_idlest_cpu(group, cpu); +		if (new_cpu == -1 || new_cpu == cpu) +			goto nextlevel; + +		/* Now try balancing at a lower domain level */ +		cpu = new_cpu; +nextlevel: +		sd = NULL; +		weight = cpus_weight(span); +		for_each_domain(cpu, tmp) { +			if (weight <= cpus_weight(tmp->span)) +				break; +			if (tmp->flags & flag) +				sd = tmp; +		} +		/* while loop will break here if sd == NULL */ +	} + +	return cpu; +} + +#endif /* CONFIG_SMP */  /*   * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)  	for_each_domain(cpu, sd) {  		if (sd->flags & SD_WAKE_IDLE) { -			cpus_and(tmp, sd->span, cpu_online_map); -			cpus_and(tmp, tmp, p->cpus_allowed); +			cpus_and(tmp, sd->span, p->cpus_allowed);  			for_each_cpu_mask(i, tmp) {  				if (idle_cpu(i))  					return i;  			}  		} -		else break; +		else +			break;  	}  	return cpu;  } @@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)  	runqueue_t *rq;  #ifdef CONFIG_SMP  	unsigned long load, this_load; -	struct sched_domain *sd; +	struct sched_domain *sd, *this_sd = NULL;  	int new_cpu;  #endif @@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)  	if (unlikely(task_running(rq, p)))  		goto out_activate; -#ifdef CONFIG_SCHEDSTATS +	new_cpu = cpu; +  	schedstat_inc(rq, ttwu_cnt);  	if (cpu == this_cpu) {  		schedstat_inc(rq, ttwu_local); -	} else { -		for_each_domain(this_cpu, sd) { -			if (cpu_isset(cpu, sd->span)) { -				schedstat_inc(sd, ttwu_wake_remote); -				break; -			} +		goto out_set_cpu; +	} + +	for_each_domain(this_cpu, sd) { +		if (cpu_isset(cpu, sd->span)) { +			schedstat_inc(sd, ttwu_wake_remote); +			this_sd = sd; +			break;  		}  	} -#endif -	new_cpu = cpu; -	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) +	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))  		goto out_set_cpu; -	load = source_load(cpu); -	this_load = target_load(this_cpu); -  	/* -	 * If sync wakeup then subtract the (maximum possible) effect of -	 * the currently running task from the load of the current CPU: +	 * Check for affine wakeup and passive balancing possibilities.  	 */ -	if (sync) -		this_load -= SCHED_LOAD_SCALE; +	if (this_sd) { +		int idx = this_sd->wake_idx; +		unsigned int imbalance; -	/* Don't pull the task off an idle CPU to a busy one */ -	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) -		goto out_set_cpu; +		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; -	new_cpu = this_cpu; /* Wake to this CPU if we can */ +		load = source_load(cpu, idx); +		this_load = target_load(this_cpu, idx); -	/* -	 * Scan domains for affine wakeup and passive balancing -	 * possibilities. -	 */ -	for_each_domain(this_cpu, sd) { -		unsigned int imbalance; -		/* -		 * Start passive balancing when half the imbalance_pct -		 * limit is reached. 
-		 */ -		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; +		new_cpu = this_cpu; /* Wake to this CPU if we can */ -		if ((sd->flags & SD_WAKE_AFFINE) && -				!task_hot(p, rq->timestamp_last_tick, sd)) { +		if (this_sd->flags & SD_WAKE_AFFINE) { +			unsigned long tl = this_load;  			/* -			 * This domain has SD_WAKE_AFFINE and p is cache cold -			 * in this domain. +			 * If sync wakeup then subtract the (maximum possible) +			 * effect of the currently running task from the load +			 * of the current CPU:  			 */ -			if (cpu_isset(cpu, sd->span)) { -				schedstat_inc(sd, ttwu_move_affine); +			if (sync) +				tl -= SCHED_LOAD_SCALE; + +			if ((tl <= load && +				tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || +				100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { +				/* +				 * This domain has SD_WAKE_AFFINE and +				 * p is cache cold in this domain, and +				 * there is no bad imbalance. +				 */ +				schedstat_inc(this_sd, ttwu_move_affine);  				goto out_set_cpu;  			} -		} else if ((sd->flags & SD_WAKE_BALANCE) && -				imbalance*this_load <= 100*load) { -			/* -			 * This domain has SD_WAKE_BALANCE and there is -			 * an imbalance. -			 */ -			if (cpu_isset(cpu, sd->span)) { -				schedstat_inc(sd, ttwu_move_balance); +		} + +		/* +		 * Start passive balancing when half the imbalance_pct +		 * limit is reached. +		 */ +		if (this_sd->flags & SD_WAKE_BALANCE) { +			if (imbalance*this_load <= 100*load) { +				schedstat_inc(this_sd, ttwu_move_balance);  				goto out_set_cpu;  			}  		} @@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)  	return try_to_wake_up(p, state, 0);  } -#ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, -			   struct sched_domain *sd); -#endif -  /*   * Perform scheduler related setup for a newly forked process p.   * p is forked by current.   */ -void fastcall sched_fork(task_t *p) +void fastcall sched_fork(task_t *p, int clone_flags)  { +	int cpu = get_cpu(); + +#ifdef CONFIG_SMP +	cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif +	set_task_cpu(p, cpu); +  	/*  	 * We mark the process as running here, but have not actually  	 * inserted it onto the runqueue yet. This guarantees that @@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)  	p->state = TASK_RUNNING;  	INIT_LIST_HEAD(&p->run_list);  	p->array = NULL; -	spin_lock_init(&p->switch_lock);  #ifdef CONFIG_SCHEDSTATS  	memset(&p->sched_info, 0, sizeof(p->sched_info));  #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +	p->oncpu = 0; +#endif  #ifdef CONFIG_PREEMPT -	/* -	 * During context-switch we hold precisely one spinlock, which -	 * schedule_tail drops. (in the common case it's this_rq()->lock, -	 * but it also can be p->switch_lock.) So we compensate with a count -	 * of 1. Also, we want to start with kernel preemption disabled. -	 */ +	/* Want to start with kernel preemption disabled. */  	p->thread_info->preempt_count = 1;  #endif  	/* @@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)  		 * runqueue lock is not a problem.  		 
*/  		current->time_slice = 1; -		preempt_disable();  		scheduler_tick(); -		local_irq_enable(); -		preempt_enable(); -	} else -		local_irq_enable(); +	} +	local_irq_enable(); +	put_cpu();  }  /* @@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)  	runqueue_t *rq, *this_rq;  	rq = task_rq_lock(p, &flags); -	cpu = task_cpu(p); -	this_cpu = smp_processor_id(); -  	BUG_ON(p->state != TASK_RUNNING); +	this_cpu = smp_processor_id(); +	cpu = task_cpu(p);  	/*  	 * We decrease the sleep average of forking parents @@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)  }  /** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +{ +	prepare_lock_switch(rq, next); +	prepare_arch_switch(next); +} + +/**   * finish_task_switch - clean up after a task-switch   * @prev: the thread we just switched away from.   * - * We enter this with the runqueue still locked, and finish_arch_switch() - * will unlock it along with doing any other architecture-specific cleanup - * actions. + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions.   *   * Note that we may have delayed dropping an mm in context_switch(). If   * so, we finish that here outside of the runqueue lock.  (Doing it   * with the lock held can cause deadlocks; see schedule() for   * details.)   */ -static inline void finish_task_switch(task_t *prev) +static inline void finish_task_switch(runqueue_t *rq, task_t *prev)  	__releases(rq->lock)  { -	runqueue_t *rq = this_rq();  	struct mm_struct *mm = rq->prev_mm;  	unsigned long prev_task_flags; @@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)  	 *		Manfred Spraul <manfred@colorfullife.com>  	 */  	prev_task_flags = prev->flags; -	finish_arch_switch(rq, prev); +	finish_arch_switch(prev); +	finish_lock_switch(rq, prev);  	if (mm)  		mmdrop(mm);  	if (unlikely(prev_task_flags & PF_DEAD)) @@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)  asmlinkage void schedule_tail(task_t *prev)  	__releases(rq->lock)  { -	finish_task_switch(prev); - +	runqueue_t *rq = this_rq(); +	finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW +	/* In this case, finish_task_switch does not reenable preemption */ +	preempt_enable(); +#endif  	if (current->set_child_tid)  		put_user(current->pid, current->set_child_tid);  } @@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)  }  /* - * find_idlest_cpu - find the least busy runqueue. 
- */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, -			   struct sched_domain *sd) -{ -	unsigned long load, min_load, this_load; -	int i, min_cpu; -	cpumask_t mask; - -	min_cpu = UINT_MAX; -	min_load = ULONG_MAX; - -	cpus_and(mask, sd->span, p->cpus_allowed); - -	for_each_cpu_mask(i, mask) { -		load = target_load(i); - -		if (load < min_load) { -			min_cpu = i; -			min_load = load; - -			/* break out early on an idle CPU: */ -			if (!min_load) -				break; -		} -	} - -	/* add +1 to account for the new task */ -	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; - -	/* -	 * Would with the addition of the new task to the -	 * current CPU there be an imbalance between this -	 * CPU and the idlest CPU? -	 * -	 * Use half of the balancing threshold - new-context is -	 * a good opportunity to balance. -	 */ -	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) -		return min_cpu; - -	return this_cpu; -} - -/*   * If dest_cpu is allowed for this process, migrate the task to it.   * This is accomplished by forcing the cpu_allowed mask to only   * allow dest_cpu, which will force the cpu onto dest_cpu.  Then @@ -1571,37 +1712,16 @@ out:  }  /* - * sched_exec(): find the highest-level, exec-balance-capable - * domain and try to migrate the task to the least loaded CPU. - * - * execve() is a valuable balancing opportunity, because at this point - * the task has the smallest effective memory and cache footprint. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint.   */  void sched_exec(void)  { -	struct sched_domain *tmp, *sd = NULL;  	int new_cpu, this_cpu = get_cpu(); - -	/* Prefer the current CPU if there's only this task running */ -	if (this_rq()->nr_running <= 1) -		goto out; - -	for_each_domain(this_cpu, tmp) -		if (tmp->flags & SD_BALANCE_EXEC) -			sd = tmp; - -	if (sd) { -		schedstat_inc(sd, sbe_attempts); -		new_cpu = find_idlest_cpu(current, this_cpu, sd); -		if (new_cpu != this_cpu) { -			schedstat_inc(sd, sbe_pushed); -			put_cpu(); -			sched_migrate_task(current, new_cpu); -			return; -		} -	} -out: +	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);  	put_cpu(); +	if (new_cpu != this_cpu) +		sched_migrate_task(current, new_cpu);  }  /* @@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,   */  static inline  int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, -		     struct sched_domain *sd, enum idle_type idle) +	     struct sched_domain *sd, enum idle_type idle, int *all_pinned)  {  	/*  	 * We do not migrate tasks that are: @@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,  	 * 2) cannot be migrated to this CPU due to cpus_allowed, or  	 * 3) are cache-hot on their current CPU.  	 */ -	if (task_running(rq, p)) -		return 0;  	if (!cpu_isset(this_cpu, p->cpus_allowed))  		return 0; +	*all_pinned = 0; + +	if (task_running(rq, p)) +		return 0;  	/*  	 * Aggressive migration if: -	 * 1) the [whole] cpu is idle, or +	 * 1) task is cache cold, or  	 * 2) too many balance attempts have failed.  	 
*/ -	if (cpu_and_siblings_are_idle(this_cpu) || \ -			sd->nr_balance_failed > sd->cache_nice_tries) +	if (sd->nr_balance_failed > sd->cache_nice_tries)  		return 1;  	if (task_hot(p, rq->timestamp_last_tick, sd)) -			return 0; +		return 0;  	return 1;  } @@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,   */  static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,  		      unsigned long max_nr_move, struct sched_domain *sd, -		      enum idle_type idle) +		      enum idle_type idle, int *all_pinned)  {  	prio_array_t *array, *dst_array;  	struct list_head *head, *curr; -	int idx, pulled = 0; +	int idx, pulled = 0, pinned = 0;  	task_t *tmp; -	if (max_nr_move <= 0 || busiest->nr_running <= 1) +	if (max_nr_move == 0)  		goto out; +	pinned = 1; +  	/*  	 * We first consider expired tasks. Those will likely not be  	 * executed in the near future, and they are most likely to @@ -1717,7 +1840,7 @@ skip_queue:  	curr = curr->prev; -	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { +	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {  		if (curr != head)  			goto skip_queue;  		idx++; @@ -1746,6 +1869,9 @@ out:  	 * inside pull_task().  	 */  	schedstat_add(sd, lb_gained[idle], pulled); + +	if (all_pinned) +		*all_pinned = pinned;  	return pulled;  } @@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  {  	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;  	unsigned long max_load, avg_load, total_load, this_load, total_pwr; +	int load_idx;  	max_load = this_load = total_load = total_pwr = 0; +	if (idle == NOT_IDLE) +		load_idx = sd->busy_idx; +	else if (idle == NEWLY_IDLE) +		load_idx = sd->newidle_idx; +	else +		load_idx = sd->idle_idx;  	do {  		unsigned long load; @@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  		for_each_cpu_mask(i, group->cpumask) {  			/* Bias balancing toward cpus of our domain */  			if (local_group) -				load = target_load(i); +				load = target_load(i, load_idx);  			else -				load = source_load(i); +				load = source_load(i, load_idx);  			avg_load += load;  		} @@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,  		if (local_group) {  			this_load = avg_load;  			this = group; -			goto nextgroup;  		} else if (avg_load > max_load) {  			max_load = avg_load;  			busiest = group;  		} -nextgroup:  		group = group->next;  	} while (group != sd->groups); @@ -1870,15 +2001,9 @@ nextgroup:  	/* Get rid of the scaling factor, rounding down as we divide */  	*imbalance = *imbalance / SCHED_LOAD_SCALE; -  	return busiest;  out_balanced: -	if (busiest && (idle == NEWLY_IDLE || -			(idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { -		*imbalance = 1; -		return busiest; -	}  	*imbalance = 0;  	return NULL; @@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)  	int i;  	for_each_cpu_mask(i, group->cpumask) { -		load = source_load(i); +		load = source_load(i, 0);  		if (load > max_load) {  			max_load = load; @@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)  }  /* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL	512 + +/*   * Check this_cpu to ensure it is balanced within domain. Attempt to move   * tasks if there is an imbalance.   
* @@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,  	struct sched_group *group;  	runqueue_t *busiest;  	unsigned long imbalance; -	int nr_moved; +	int nr_moved, all_pinned = 0; +	int active_balance = 0;  	spin_lock(&this_rq->lock);  	schedstat_inc(sd, lb_cnt[idle]); @@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,  		goto out_balanced;  	} -	/* -	 * This should be "impossible", but since load -	 * balancing is inherently racy and statistical, -	 * it could happen in theory. -	 */ -	if (unlikely(busiest == this_rq)) { -		WARN_ON(1); -		goto out_balanced; -	} +	BUG_ON(busiest == this_rq);  	schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,  		 */  		double_lock_balance(this_rq, busiest);  		nr_moved = move_tasks(this_rq, this_cpu, busiest, -						imbalance, sd, idle); +						imbalance, sd, idle, +						&all_pinned);  		spin_unlock(&busiest->lock); + +		/* All tasks on this runqueue were pinned by CPU affinity */ +		if (unlikely(all_pinned)) +			goto out_balanced;  	} +  	spin_unlock(&this_rq->lock);  	if (!nr_moved) { @@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,  		sd->nr_balance_failed++;  		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { -			int wake = 0;  			spin_lock(&busiest->lock);  			if (!busiest->active_balance) {  				busiest->active_balance = 1;  				busiest->push_cpu = this_cpu; -				wake = 1; +				active_balance = 1;  			}  			spin_unlock(&busiest->lock); -			if (wake) +			if (active_balance)  				wake_up_process(busiest->migration_thread);  			/*  			 * We've kicked active balancing, reset the failure  			 * counter.  			 */ -			sd->nr_balance_failed = sd->cache_nice_tries; +			sd->nr_balance_failed = sd->cache_nice_tries+1;  		} - -		/* -		 * We were unbalanced, but unsuccessful in move_tasks(), -		 * so bump the balance_interval to lessen the lock contention. -		 */ -		if (sd->balance_interval < sd->max_interval) -			sd->balance_interval++; -	} else { +	} else  		sd->nr_balance_failed = 0; +	if (likely(!active_balance)) {  		/* We were unbalanced, so reset the balancing interval */  		sd->balance_interval = sd->min_interval; +	} else { +		/* +		 * If we've begun active balancing, start to back off. This +		 * case may not be covered by the all_pinned logic if there +		 * is only 1 task on the busy runqueue (because we don't call +		 * move_tasks). 
+		 */ +		if (sd->balance_interval < sd->max_interval) +			sd->balance_interval *= 2;  	}  	return nr_moved; @@ -2005,8 +2137,10 @@ out_balanced:  	schedstat_inc(sd, lb_balanced[idle]); +	sd->nr_balance_failed = 0;  	/* tune up the balancing interval */ -	if (sd->balance_interval < sd->max_interval) +	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || +			(sd->balance_interval < sd->max_interval))  		sd->balance_interval *= 2;  	return 0; @@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,  	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);  	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);  	if (!group) { -		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);  		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); -		goto out; +		goto out_balanced;  	}  	busiest = find_busiest_queue(group); -	if (!busiest || busiest == this_rq) { -		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); +	if (!busiest) {  		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); -		goto out; +		goto out_balanced;  	} +	BUG_ON(busiest == this_rq); +  	/* Attempt to move tasks */  	double_lock_balance(this_rq, busiest);  	schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);  	nr_moved = move_tasks(this_rq, this_cpu, busiest, -					imbalance, sd, NEWLY_IDLE); +					imbalance, sd, NEWLY_IDLE, NULL);  	if (!nr_moved)  		schedstat_inc(sd, lb_failed[NEWLY_IDLE]); +	else +		sd->nr_balance_failed = 0;  	spin_unlock(&busiest->lock); - -out:  	return nr_moved; + +out_balanced: +	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); +	sd->nr_balance_failed = 0; +	return 0;  }  /* @@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)  static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)  {  	struct sched_domain *sd; -	struct sched_group *cpu_group;  	runqueue_t *target_rq; -	cpumask_t visited_cpus; -	int cpu; +	int target_cpu = busiest_rq->push_cpu; + +	if (busiest_rq->nr_running <= 1) +		/* no task to move */ +		return; + +	target_rq = cpu_rq(target_cpu);  	/* -	 * Search for suitable CPUs to push tasks to in successively higher -	 * domains with SD_LOAD_BALANCE set. +	 * This condition is "impossible", if it occurs +	 * we need to fix it.  Originally reported by +	 * Bjorn Helgaas on a 128-cpu setup.  	 */ -	visited_cpus = CPU_MASK_NONE; -	for_each_domain(busiest_cpu, sd) { -		if (!(sd->flags & SD_LOAD_BALANCE)) -			/* no more domains to search */ -			break; +	BUG_ON(busiest_rq == target_rq); -		schedstat_inc(sd, alb_cnt); +	/* move a task from busiest_rq to target_rq */ +	double_lock_balance(busiest_rq, target_rq); -		cpu_group = sd->groups; -		do { -			for_each_cpu_mask(cpu, cpu_group->cpumask) { -				if (busiest_rq->nr_running <= 1) -					/* no more tasks left to move */ -					return; -				if (cpu_isset(cpu, visited_cpus)) -					continue; -				cpu_set(cpu, visited_cpus); -				if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) -					continue; - -				target_rq = cpu_rq(cpu); -				/* -				 * This condition is "impossible", if it occurs -				 * we need to fix it.  Originally reported by -				 * Bjorn Helgaas on a 128-cpu setup. 
-				 */ -				BUG_ON(busiest_rq == target_rq); - -				/* move a task from busiest_rq to target_rq */ -				double_lock_balance(busiest_rq, target_rq); -				if (move_tasks(target_rq, cpu, busiest_rq, -						1, sd, SCHED_IDLE)) { -					schedstat_inc(sd, alb_pushed); -				} else { -					schedstat_inc(sd, alb_failed); -				} -				spin_unlock(&target_rq->lock); -			} -			cpu_group = cpu_group->next; -		} while (cpu_group != sd->groups); -	} +	/* Search for an sd spanning us and the target CPU. */ +	for_each_domain(target_cpu, sd) +		if ((sd->flags & SD_LOAD_BALANCE) && +			cpu_isset(busiest_cpu, sd->span)) +				break; + +	if (unlikely(sd == NULL)) +		goto out; + +	schedstat_inc(sd, alb_cnt); + +	if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) +		schedstat_inc(sd, alb_pushed); +	else +		schedstat_inc(sd, alb_failed); +out: +	spin_unlock(&target_rq->lock);  }  /* @@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,  	unsigned long old_load, this_load;  	unsigned long j = jiffies + CPU_OFFSET(this_cpu);  	struct sched_domain *sd; +	int i; -	/* Update our load */ -	old_load = this_rq->cpu_load;  	this_load = this_rq->nr_running * SCHED_LOAD_SCALE; -	/* -	 * Round up the averaging division if load is increasing. This -	 * prevents us from getting stuck on 9 if the load is 10, for -	 * example. -	 */ -	if (this_load > old_load) -		old_load++; -	this_rq->cpu_load = (old_load + this_load) / 2; +	/* Update our load */ +	for (i = 0; i < 3; i++) { +		unsigned long new_load = this_load; +		int scale = 1 << i; +		old_load = this_rq->cpu_load[i]; +		/* +		 * Round up the averaging division if load is increasing. This +		 * prevents us from getting stuck on 9 if the load is 10, for +		 * example. +		 */ +		if (new_load > old_load) +			new_load += scale-1; +		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; +	}  	for_each_domain(this_cpu, sd) {  		unsigned long interval; @@ -2447,11 +2577,15 @@ out:  #ifdef CONFIG_SCHED_SMT  static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)  { -	struct sched_domain *sd = this_rq->sd; +	struct sched_domain *tmp, *sd = NULL;  	cpumask_t sibling_map;  	int i; -	if (!(sd->flags & SD_SHARE_CPUPOWER)) +	for_each_domain(this_cpu, tmp) +		if (tmp->flags & SD_SHARE_CPUPOWER) +			sd = tmp; + +	if (!sd)  		return;  	/* @@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)  static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)  { -	struct sched_domain *sd = this_rq->sd; +	struct sched_domain *tmp, *sd = NULL;  	cpumask_t sibling_map;  	prio_array_t *array;  	int ret = 0, i;  	task_t *p; -	if (!(sd->flags & SD_SHARE_CPUPOWER)) +	for_each_domain(this_cpu, tmp) +		if (tmp->flags & SD_SHARE_CPUPOWER) +			sd = tmp; + +	if (!sd)  		return 0;  	/* @@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)  	struct list_head *queue;  	unsigned long long now;  	unsigned long run_time; -	int cpu, idx; +	int cpu, idx, new_prio;  	/*  	 * Test if we are atomic.  
Since do_exit() needs to call into @@ -2735,9 +2873,14 @@ go_idle:  			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;  		array = next->array; -		dequeue_task(next, array); -		recalc_task_prio(next, next->timestamp + delta); -		enqueue_task(next, array); +		new_prio = recalc_task_prio(next, next->timestamp + delta); + +		if (unlikely(next->prio != new_prio)) { +			dequeue_task(next, array); +			next->prio = new_prio; +			enqueue_task(next, array); +		} else +			requeue_task(next, array);  	}  	next->activated = 0;  switch_tasks: @@ -2761,11 +2904,15 @@ switch_tasks:  		rq->curr = next;  		++*switch_count; -		prepare_arch_switch(rq, next); +		prepare_task_switch(rq, next);  		prev = context_switch(rq, prev, next);  		barrier(); - -		finish_task_switch(prev); +		/* +		 * this_rq must be evaluated again because prev may have moved +		 * CPUs since it called schedule(), thus the 'rq' on its stack +		 * frame will be invalid. +		 */ +		finish_task_switch(this_rq(), prev);  	} else  		spin_unlock_irq(&rq->lock); @@ -3384,13 +3531,24 @@ recheck:  	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))  		return -EINVAL; -	if ((policy == SCHED_FIFO || policy == SCHED_RR) && -	    param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && -	    !capable(CAP_SYS_NICE)) -		return -EPERM; -	if ((current->euid != p->euid) && (current->euid != p->uid) && -	    !capable(CAP_SYS_NICE)) -		return -EPERM; +	/* +	 * Allow unprivileged RT tasks to decrease priority: +	 */ +	if (!capable(CAP_SYS_NICE)) { +		/* can't change policy */ +		if (policy != p->policy) +			return -EPERM; +		/* can't increase priority */ +		if (policy != SCHED_NORMAL && +		    param->sched_priority > p->rt_priority && +		    param->sched_priority > +				p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) +			return -EPERM; +		/* can't change other user's priorities */ +		if ((current->euid != p->euid) && +		    (current->euid != p->uid)) +			return -EPERM; +	}  	retval = security_task_setscheduler(p, policy, param);  	if (retval) @@ -4030,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)  	spin_lock_irqsave(&rq->lock, flags);  	rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) +	idle->oncpu = 1; +#endif  	set_tsk_need_resched(idle);  	spin_unlock_irqrestore(&rq->lock, flags); @@ -4174,8 +4335,7 @@ static int migration_thread(void * data)  		struct list_head *head;  		migration_req_t *req; -		if (current->flags & PF_FREEZE) -			refrigerator(PF_FREEZE); +		try_to_freeze();  		spin_lock_irq(&rq->lock); @@ -4200,17 +4360,9 @@ static int migration_thread(void * data)  		req = list_entry(head->next, migration_req_t, list);  		list_del_init(head->next); -		if (req->type == REQ_MOVE_TASK) { -			spin_unlock(&rq->lock); -			__migrate_task(req->task, cpu, req->dest_cpu); -			local_irq_enable(); -		} else if (req->type == REQ_SET_DOMAIN) { -			rq->sd = req->sd; -			spin_unlock_irq(&rq->lock); -		} else { -			spin_unlock_irq(&rq->lock); -			WARN_ON(1); -		} +		spin_unlock(&rq->lock); +		__migrate_task(req->task, cpu, req->dest_cpu); +		local_irq_enable();  		complete(&req->done);  	} @@ -4441,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,  			migration_req_t *req;  			req = list_entry(rq->migration_queue.next,  					 migration_req_t, list); -			BUG_ON(req->type != REQ_MOVE_TASK);  			list_del_init(&req->list);  			complete(&req->done);  		} @@ -4472,12 +4623,17 @@ int __init migration_init(void)  #endif  #ifdef CONFIG_SMP -#define 
SCHED_DOMAIN_DEBUG +#undef SCHED_DOMAIN_DEBUG  #ifdef SCHED_DOMAIN_DEBUG  static void sched_domain_debug(struct sched_domain *sd, int cpu)  {  	int level = 0; +	if (!sd) { +		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); +		return; +	} +  	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);  	do { @@ -4560,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)  #define sched_domain_debug(sd, cpu) {}  #endif +static int sd_degenerate(struct sched_domain *sd) +{ +	if (cpus_weight(sd->span) == 1) +		return 1; + +	/* Following flags need at least 2 groups */ +	if (sd->flags & (SD_LOAD_BALANCE | +			 SD_BALANCE_NEWIDLE | +			 SD_BALANCE_FORK | +			 SD_BALANCE_EXEC)) { +		if (sd->groups != sd->groups->next) +			return 0; +	} + +	/* Following flags don't use groups */ +	if (sd->flags & (SD_WAKE_IDLE | +			 SD_WAKE_AFFINE | +			 SD_WAKE_BALANCE)) +		return 0; + +	return 1; +} + +static int sd_parent_degenerate(struct sched_domain *sd, +						struct sched_domain *parent) +{ +	unsigned long cflags = sd->flags, pflags = parent->flags; + +	if (sd_degenerate(parent)) +		return 1; + +	if (!cpus_equal(sd->span, parent->span)) +		return 0; + +	/* Does parent contain flags not in child? */ +	/* WAKE_BALANCE is a subset of WAKE_AFFINE */ +	if (cflags & SD_WAKE_AFFINE) +		pflags &= ~SD_WAKE_BALANCE; +	/* Flags needing groups don't count if only 1 group in parent */ +	if (parent->groups == parent->groups->next) { +		pflags &= ~(SD_LOAD_BALANCE | +				SD_BALANCE_NEWIDLE | +				SD_BALANCE_FORK | +				SD_BALANCE_EXEC); +	} +	if (~cflags & pflags) +		return 0; + +	return 1; +} +  /*   * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must   * hold the hotplug lock.   */ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +void cpu_attach_domain(struct sched_domain *sd, int cpu)  { -	migration_req_t req; -	unsigned long flags;  	runqueue_t *rq = cpu_rq(cpu); -	int local = 1; - -	sched_domain_debug(sd, cpu); +	struct sched_domain *tmp; -	spin_lock_irqsave(&rq->lock, flags); - -	if (cpu == smp_processor_id() || !cpu_online(cpu)) { -		rq->sd = sd; -	} else { -		init_completion(&req.done); -		req.type = REQ_SET_DOMAIN; -		req.sd = sd; -		list_add(&req.list, &rq->migration_queue); -		local = 0; +	/* Remove the sched domains which do not contribute to scheduling. */ +	for (tmp = sd; tmp; tmp = tmp->parent) { +		struct sched_domain *parent = tmp->parent; +		if (!parent) +			break; +		if (sd_parent_degenerate(tmp, parent)) +			tmp->parent = parent->parent;  	} -	spin_unlock_irqrestore(&rq->lock, flags); +	if (sd && sd_degenerate(sd)) +		sd = sd->parent; -	if (!local) { -		wake_up_process(rq->migration_thread); -		wait_for_completion(&req.done); -	} +	sched_domain_debug(sd, cpu); + +	rcu_assign_pointer(rq->sd, sd);  }  /* cpus with isolated domains */ @@ -4622,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);   * covered by the given span, and will set each group's ->cpumask correctly,   * and ->cpu_power to 0.   
*/ -void __devinit init_sched_build_groups(struct sched_group groups[], +void init_sched_build_groups(struct sched_group groups[],  			cpumask_t span, int (*group_fn)(int cpu))  {  	struct sched_group *first = NULL, *last = NULL; @@ -4658,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],  #ifdef ARCH_HAS_SCHED_DOMAIN -extern void __devinit arch_init_sched_domains(void); -extern void __devinit arch_destroy_sched_domains(void); +extern void build_sched_domains(const cpumask_t *cpu_map); +extern void arch_init_sched_domains(const cpumask_t *cpu_map); +extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);  #else  #ifdef CONFIG_SCHED_SMT  static DEFINE_PER_CPU(struct sched_domain, cpu_domains);  static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu)  {  	return cpu;  } @@ -4672,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)  static DEFINE_PER_CPU(struct sched_domain, phys_domains);  static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu)  {  #ifdef CONFIG_SCHED_SMT  	return first_cpu(cpu_sibling_map[cpu]); @@ -4685,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)  static DEFINE_PER_CPU(struct sched_domain, node_domains);  static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int __devinit cpu_to_node_group(int cpu) +static int cpu_to_node_group(int cpu)  {  	return cpu_to_node(cpu);  } @@ -4716,39 +4917,28 @@ static void check_sibling_maps(void)  #endif  /* - * Set up scheduler domains and groups.  Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus   */ -static void __devinit arch_init_sched_domains(void) +static void build_sched_domains(const cpumask_t *cpu_map)  {  	int i; -	cpumask_t cpu_default_map; - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) -	check_sibling_maps(); -#endif -	/* -	 * Setup mask for cpus without special case scheduling requirements. -	 * For now this just excludes isolated cpus, but could be used to -	 * exclude other special cases in the future. -	 */ -	cpus_complement(cpu_default_map, cpu_isolated_map); -	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);  	/* -	 * Set up domains. Isolated domains just stay on the dummy domain. +	 * Set up domains for cpus specified by the cpu_map.  	 
*/ -	for_each_cpu_mask(i, cpu_default_map) { +	for_each_cpu_mask(i, *cpu_map) {  		int group;  		struct sched_domain *sd = NULL, *p;  		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); -		cpus_and(nodemask, nodemask, cpu_default_map); +		cpus_and(nodemask, nodemask, *cpu_map);  #ifdef CONFIG_NUMA  		sd = &per_cpu(node_domains, i);  		group = cpu_to_node_group(i);  		*sd = SD_NODE_INIT; -		sd->span = cpu_default_map; +		sd->span = *cpu_map;  		sd->groups = &sched_group_nodes[group];  #endif @@ -4766,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)  		group = cpu_to_cpu_group(i);  		*sd = SD_SIBLING_INIT;  		sd->span = cpu_sibling_map[i]; -		cpus_and(sd->span, sd->span, cpu_default_map); +		cpus_and(sd->span, sd->span, *cpu_map);  		sd->parent = p;  		sd->groups = &sched_group_cpus[group];  #endif @@ -4776,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)  	/* Set up CPU (sibling) groups */  	for_each_online_cpu(i) {  		cpumask_t this_sibling_map = cpu_sibling_map[i]; -		cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); +		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);  		if (i != first_cpu(this_sibling_map))  			continue; @@ -4789,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)  	for (i = 0; i < MAX_NUMNODES; i++) {  		cpumask_t nodemask = node_to_cpumask(i); -		cpus_and(nodemask, nodemask, cpu_default_map); +		cpus_and(nodemask, nodemask, *cpu_map);  		if (cpus_empty(nodemask))  			continue; @@ -4799,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)  #ifdef CONFIG_NUMA  	/* Set up node groups */ -	init_sched_build_groups(sched_group_nodes, cpu_default_map, +	init_sched_build_groups(sched_group_nodes, *cpu_map,  					&cpu_to_node_group);  #endif  	/* Calculate CPU power for physical packages and nodes */ -	for_each_cpu_mask(i, cpu_default_map) { +	for_each_cpu_mask(i, *cpu_map) {  		int power;  		struct sched_domain *sd;  #ifdef CONFIG_SCHED_SMT @@ -4828,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)  	}  	/* Attach the domains */ -	for_each_online_cpu(i) { +	for_each_cpu_mask(i, *cpu_map) {  		struct sched_domain *sd;  #ifdef CONFIG_SCHED_SMT  		sd = &per_cpu(cpu_domains, i); @@ -4838,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)  		cpu_attach_domain(sd, i);  	}  } +/* + * Set up scheduler domains and groups.  Callers must hold the hotplug lock. + */ +static void arch_init_sched_domains(cpumask_t *cpu_map) +{ +	cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) +	check_sibling_maps(); +#endif +	/* +	 * Setup mask for cpus without special case scheduling requirements. +	 * For now this just excludes isolated cpus, but could be used to +	 * exclude other special cases in the future. +	 */ +	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + +	build_sched_domains(&cpu_default_map); +} -#ifdef CONFIG_HOTPLUG_CPU -static void __devinit arch_destroy_sched_domains(void) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map)  {  	/* Do nothing: everything is statically allocated. */  } -#endif  #endif /* ARCH_HAS_SCHED_DOMAIN */  /* - * Initial dummy domain for early boot and for hotplug cpu. Being static, - * it is initialized to zero, so all balancing flags are cleared which is - * what we want. 
+ * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain   */ -static struct sched_domain sched_domain_dummy; +static inline void detach_destroy_domains(const cpumask_t *cpu_map) +{ +	int i; + +	for_each_cpu_mask(i, *cpu_map) +		cpu_attach_domain(NULL, i); +	synchronize_sched(); +	arch_destroy_sched_domains(cpu_map); +} + +/* + * Partition sched domains as specified by the cpumasks below. + * This attaches all cpus from the cpumasks to the NULL domain, + * waits for a RCU quiescent period, recalculates sched + * domain information and then attaches them back to the + * correct sched domains + * Call with hotplug lock held + */ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +{ +	cpumask_t change_map; + +	cpus_and(*partition1, *partition1, cpu_online_map); +	cpus_and(*partition2, *partition2, cpu_online_map); +	cpus_or(change_map, *partition1, *partition2); + +	/* Detach sched domains from all of the affected cpus */ +	detach_destroy_domains(&change_map); +	if (!cpus_empty(*partition1)) +		build_sched_domains(partition1); +	if (!cpus_empty(*partition2)) +		build_sched_domains(partition2); +}  #ifdef CONFIG_HOTPLUG_CPU  /*   * Force a reinitialization of the sched domains hierarchy.  The domains   * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to a "dummy" domain + * code, so we temporarily attach all running cpus to the NULL domain   * which will prevent rebalancing while the sched domains are recalculated.   */  static int update_sched_domains(struct notifier_block *nfb,  				unsigned long action, void *hcpu)  { -	int i; -  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_DOWN_PREPARE: -		for_each_online_cpu(i) -			cpu_attach_domain(&sched_domain_dummy, i); -		arch_destroy_sched_domains(); +		detach_destroy_domains(&cpu_online_map);  		return NOTIFY_OK;  	case CPU_UP_CANCELED: @@ -4888,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,  	}  	/* The hotplug lock is already held by cpu_up/cpu_down */ -	arch_init_sched_domains(); +	arch_init_sched_domains(&cpu_online_map);  	return NOTIFY_OK;  } @@ -4897,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,  void __init sched_init_smp(void)  {  	lock_cpu_hotplug(); -	arch_init_sched_domains(); +	arch_init_sched_domains(&cpu_online_map);  	unlock_cpu_hotplug();  	/* XXX: Theoretical race here - CPU may be hotplugged now */  	hotcpu_notifier(update_sched_domains, 0); @@ -4927,13 +5161,15 @@ void __init sched_init(void)  		rq = cpu_rq(i);  		spin_lock_init(&rq->lock); +		rq->nr_running = 0;  		rq->active = rq->arrays;  		rq->expired = rq->arrays + 1;  		rq->best_expired_prio = MAX_PRIO;  #ifdef CONFIG_SMP -		rq->sd = &sched_domain_dummy; -		rq->cpu_load = 0; +		rq->sd = NULL; +		for (j = 1; j < 3; j++) +			rq->cpu_load[j] = 0;  		rq->active_balance = 0;  		rq->push_cpu = 0;  		rq->migration_thread = NULL; | 
