Diffstat (limited to 'kernel/sched/fair.c')
 kernel/sched/fair.c | 94
 1 file changed, 68 insertions(+), 26 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6559d197e08a..274c747a01ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
if (tg_weight)
shares /= tg_weight;
+ /*
+ * MIN_SHARES has to be unscaled here to support per-CPU partitioning
+ * of a group with small tg->shares value. It is a floor value which is
+ * assigned as a minimum load.weight to the sched_entity representing
+ * the group on a CPU.
+ *
+ * E.g. on 64-bit, for a group with tg->shares of scale_load(15)=15*1024
+ * on an 8-core system with 8 tasks each runnable on one CPU, shares has
+ * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
+ * case no task is runnable on a CPU, MIN_SHARES=2 should be returned
+ * instead of 0.
+ */
if (shares < MIN_SHARES)
shares = MIN_SHARES;
if (shares > tg->shares)
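
As a quick sanity check on the arithmetic in the comment above, the clamping can be modelled in a few lines of userspace C. This is only a sketch of the math, not the kernel function: calc_shares_model(), MODEL_MIN_SHARES and MODEL_SCALE are made-up names, and the per-CPU load and total group weight are passed in directly instead of being read from the runqueues.

#include <stdio.h>

/* Illustrative stand-ins; the values mirror the comment above. */
#define MODEL_MIN_SHARES  2UL     /* unscaled MIN_SHARES floor */
#define MODEL_SCALE       1024UL  /* scale_load() factor on 64-bit */

/*
 * Simplified model: tg->shares scaled by this CPU's portion of the
 * group load, clamped to [MIN_SHARES, tg->shares].
 */
static unsigned long calc_shares_model(unsigned long tg_shares,
                                       unsigned long cpu_load,
                                       unsigned long tg_weight)
{
        unsigned long shares = tg_shares;

        if (tg_weight)
                shares = tg_shares * cpu_load / tg_weight;
        if (shares < MODEL_MIN_SHARES)
                shares = MODEL_MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        unsigned long tg_shares = 15 * MODEL_SCALE;  /* scale_load(15) */

        /* 8 CPUs, one runnable task each: 15*1024/8 = 1920, not 2*1024. */
        printf("busy CPU: %lu\n", calc_shares_model(tg_shares, 1, 8));
        /* No runnable task on this CPU: the MIN_SHARES floor of 2, not 0. */
        printf("idle CPU: %lu\n", calc_shares_model(tg_shares, 0, 8));
        return 0;
}

Compiled and run, this prints 1920 and 2, matching the two cases the comment spells out.
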
@@ -2689,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2707,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
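
The new signature works because only a group entity owns a cfs_rq of its own: group_cfs_rq() hands back that runqueue, and for a plain task entity it is NULL, which is exactly what the added early return tests in place of the old !se check on tg->se[cpu]. A minimal sketch of that relationship, with heavily trimmed stand-ins for the kernel structures (only the two pointers that matter here are shown):

struct cfs_rq;        /* opaque for this sketch */

struct sched_entity {
        struct cfs_rq *cfs_rq;   /* the runqueue this entity is queued on */
        struct cfs_rq *my_q;     /* the runqueue this entity represents;
                                  * NULL for a plain task entity */
};

/* Mirrors the kernel helper: the cfs_rq owned by a group entity. */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
        return grp->my_q;
}
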
@@ -3424,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq);
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
@@ -3453,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline int idle_balance(struct rq *rq)
+static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
{
return 0;
}
@@ -3582,10 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3657,6 +3682,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
@@ -3681,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
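
Taken together with the enqueue hunk above, update_cfs_shares() now sits on opposite sides of the weight accounting in the two paths. A compressed view of the resulting call order, showing only the calls visible in this diff:

/*
 * enqueue_entity():
 *      update_load_avg(se, UPDATE_TG)
 *      enqueue_entity_load_avg(cfs_rq, se)
 *      update_cfs_shares(se)               <- weight refreshed first...
 *      account_entity_enqueue(cfs_rq, se)     ...so the new weight is added
 *
 * dequeue_entity():
 *      update_load_avg(se, UPDATE_TG)
 *      dequeue_entity_load_avg(cfs_rq, se)
 *      ... previous weight removed from cfs_rq->load.weight ...
 *      update_cfs_shares(se)               <- share recomputed afterwards
 */
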
@@ -3864,7 +3898,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* Ensure that runnable average is periodically updated.
*/
update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4761,7 +4795,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -4820,7 +4854,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -6213,7 +6247,7 @@ preempt:
}
static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -6320,15 +6354,8 @@ simple:
return p;
idle:
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- lockdep_unpin_lock(&rq->lock, cookie);
- new_tasks = idle_balance(rq);
- lockdep_repin_lock(&rq->lock, cookie);
+ new_tasks = idle_balance(rq, rf);
+
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -8077,6 +8104,7 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
/*
* cur_ld_moved - load moved in current iteration
@@ -8297,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq)
+static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -8311,6 +8339,14 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ rq_unpin_lock(this_rq, rf);
+
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
!this_rq->rd->overload) {
rcu_read_lock();
@@ -8388,6 +8424,8 @@ out:
if (pulled_task)
this_rq->idle_stamp = 0;
+ rq_repin_lock(this_rq, rf);
+
return pulled_task;
}
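
The lockdep pin handling that used to bracket the idle_balance() call in pick_next_task_fair() now lives inside idle_balance() itself, around the region where rq->lock may legitimately be dropped. A rough sketch of the resulting protocol; only the call order is shown and the balancing loop in between is elided:

/*
 * idle_balance(this_rq, rf)
 *      rq_unpin_lock(this_rq, rf)        -- release the caller's pin up front
 *      ...
 *      raw_spin_unlock(&this_rq->lock)   -- drop rq->lock to pull tasks
 *      load_balance(...)
 *      raw_spin_lock(&this_rq->lock)
 *      ...
 *      rq_repin_lock(this_rq, rf)        -- re-pin before returning to the
 *                                           pick_next_task_fair() picking loop
 */
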
@@ -8443,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data)
};
schedstat_inc(sd->alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
@@ -9264,6 +9303,7 @@ void online_fair_sched_group(struct task_group *tg)
se = tg->se[i];
raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
raw_spin_unlock_irq(&rq->lock);
@@ -9356,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
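
The effect of the new loop body relies on for_each_sched_entity() walking from the group's per-CPU entity up through every parent group entity, so each ancestor level has its load average refreshed before its share is recomputed. A simplified stand-in for that walk (only the parent link is shown; the real sched_entity carries much more state):

struct sched_entity {
        struct sched_entity *parent;  /* NULL once the top of the hierarchy
                                       * is reached */
};

/* With CONFIG_FAIR_GROUP_SCHED the walk is simply entity -> parent -> NULL: */
#define for_each_sched_entity(se) \
                for (; se; se = se->parent)
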