summaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
author Josh Don <joshdon@google.com> 2021-10-18 13:34:28 -0700
committer Peter Zijlstra <peterz@infradead.org> 2021-11-17 14:49:00 +0100
commit 4feee7d12603deca8775f9f9ae5e121093837444 (patch)
tree 5b4d68bde6fbd540ec521237d75892b617241eac /kernel/sched/core.c
parent 2fb75e1b642f49253d8848c9e47e8942f5366221 (diff)
sched/core: Forced idle accounting
Adds accounting for "forced idle" time, which is time where a cookie'd task forces its SMT sibling to idle, despite the presence of runnable tasks.

Forced idle time is one means to measure the cost of enabling core scheduling (i.e. the capacity lost due to the need to force idle).

Forced idle time is attributed to the thread responsible for causing the forced idle.

A few details:
 - Forced idle time is displayed via /proc/PID/sched. It also requires that schedstats is enabled.
 - Forced idle is only accounted when a sibling hyperthread is held idle despite the presence of runnable tasks. No time is charged if a sibling is idle but has no runnable tasks.
 - Tasks with 0 cookie are never charged forced idle.
 - For SMT > 2, we scale the amount of forced idle charged based on the number of forced idle siblings. Additionally, we split the time up and evenly charge it to all running tasks, as each is equally responsible for the forced idle.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20211018203428.2025792-1-joshdon@google.com
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- kernel/sched/core.c 82
1 files changed, 62 insertions, 20 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c9b0fda64ac..beaa8be6241e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}
/*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);
rq_unlock(rq, &rf);
@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}
/*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (p == rq_i->idle) {
if (rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ if (cookie)
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;
/* copy the shared state to the new leader */
- core_rq->core_task_seq = rq->core_task_seq;
- core_rq->core_pick_seq = rq->core_pick_seq;
- core_rq->core_cookie = rq->core_cookie;
- core_rq->core_forceidle = rq->core_forceidle;
- core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -9413,7 +9453,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif