From c530a3c716b963625e43aa915e0de6b4d1ce8ad9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:02 +0800 Subject: sched/psi: Fix periodic aggregation shut off We don't want to wake periodic aggregation work back up if the task change is the aggregation worker itself going to sleep, or we'll ping-pong forever. Previously, we would use psi_task_change() in psi_dequeue() when task going to sleep, so this check was put in psi_task_change(). But commit 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") defer task sleep handling to psi_task_switch(), won't go through psi_task_change() anymore. So this patch move this check to psi_task_switch(). Fixes: 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-2-zhouchengming@bytedance.com --- kernel/sched/psi.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ecb4b4ff4ce0..39463dcc16bb 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -796,7 +796,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - bool wake_clock = true; void *iter = NULL; u64 now; @@ -806,19 +805,9 @@ void psi_task_change(struct task_struct *task, int clear, int set) psi_flags_change(task, clear, set); now = cpu_clock(cpu); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, now, wake_clock); + psi_group_change(group, cpu, clear, set, now, true); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -854,6 +843,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (prev->pid) { int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; /* * When we're going to sleep, psi_dequeue() lets us @@ -867,13 +857,23 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; } psi_flags_change(prev, clear, set); iter = NULL; while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, now, wake_clock); /* * TSK_ONCPU is handled up to the common ancestor. 
If we're tasked @@ -882,7 +882,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (sleep) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, now, true); + psi_group_change(group, cpu, clear, set, now, wake_clock); } } } -- cgit From e2ad8ab04c5cdfc8dc2f382c45d248ab01dee991 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:04 +0800 Subject: sched/psi: Save percpu memory when !psi_cgroups_enabled We won't use cgroup psi_group when !psi_cgroups_enabled, so don't bother to alloc percpu memory and init for it. Also don't need to migrate task PSI stats between cgroups in cgroup_move_task(). Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-4-zhouchengming@bytedance.com --- kernel/sched/psi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 39463dcc16bb..77d53c03a76f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -201,6 +201,7 @@ void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } @@ -950,7 +951,7 @@ void psi_memstall_leave(unsigned long *flags) #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -968,7 +969,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; cancel_delayed_work_sync(&cgroup->psi->avgs_work); @@ -996,7 +997,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. -- cgit From 65176f59a18d888684525658a1d0b8bf749d24f3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:06 +0800 Subject: sched/psi: Optimize task switch inside shared cgroups again Way back when PSI_MEM_FULL was accounted from the timer tick, task switching could simply iterate next and prev to the common ancestor to update TSK_ONCPU and be done. Then memstall ticks were replaced with checking curr->in_memstall directly in psi_group_change(). That meant that now if the task switch was between a memstall and a !memstall task, we had to iterate through the common ancestors at least ONCE to fix up their state_masks. We added the identical_state filter to make sure the common ancestor elimination was skipped in that case. It seems that was always a little too eager, because it caused us to walk the common ancestors *twice* instead of the required once: the iteration for next could have stopped at the common ancestor; prev could have updated TSK_ONCPU up to the common ancestor, then finish to the root without changing any flags, just to get the new curr->in_memstall into the state_masks. 
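The redundancy above comes down to whether prev and next differ in anything other than which of them is on the CPU. A stand-alone user-space sketch of that test; the flag values are illustrative, not the ones from psi_types.h:

#include <stdbool.h>
#include <stdio.h>

/* Flag values here are illustrative, not copied from psi_types.h. */
#define TSK_RUNNING	(1 << 0)
#define TSK_MEMSTALL	(1 << 1)
#define TSK_ONCPU	(1 << 2)

/*
 * Walking past the common ancestor is only needed when prev and next
 * differ in something other than which of them is on the CPU.
 */
static bool need_full_walk(unsigned int prev_flags, unsigned int next_flags)
{
	return (prev_flags ^ next_flags) & ~TSK_ONCPU;
}

int main(void)
{
	/* A memstall task switches out for a plain runnable task: walk to the root. */
	printf("%d\n", need_full_walk(TSK_RUNNING | TSK_MEMSTALL | TSK_ONCPU,
				      TSK_RUNNING));
	/* Identical state apart from ONCPU: stop at the common ancestor. */
	printf("%d\n", need_full_walk(TSK_RUNNING | TSK_ONCPU, TSK_RUNNING));
	return 0;
}

When only TSK_ONCPU differs, the TSK_ONCPU handling up to the common ancestor already covers everything; any other difference, such as one task being in memstall, has to be propagated to the root, but only once.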
This patch recognizes this and makes it so that we walk to the root exactly once if state_mask needs updating, which is simply catching up on a missed optimization that could have been done in commit 7fae6c8171d2 ("psi: Use ONCPU state tracking machinery to detect reclaim") directly. Apart from this, it's also necessary for the next patch "sched/psi: remove NR_ONCPU task accounting". Suppose we walk the common ancestors twice: (1) psi_group_change(.clear = 0, .set = TSK_ONCPU) (2) psi_group_change(.clear = TSK_ONCPU, .set = 0) We previously used tasks[NR_ONCPU] to record TSK_ONCPU, tasks[NR_ONCPU]++ in (1) then tasks[NR_ONCPU]-- in (2), so tasks[NR_ONCPU] still be correct. The next patch change to use one bit in state mask to record TSK_ONCPU, PSI_ONCPU bit will be set in (1), but then be cleared in (2), which cause the psi_group_cpu has task running on CPU but without PSI_ONCPU bit set! With this patch, we will never walk the common ancestors twice, so won't have above problem. Suggested-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-6-zhouchengming@bytedance.com --- kernel/sched/psi.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 77d53c03a76f..d71dbc2356ff 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -820,20 +820,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, u64 now = cpu_clock(cpu); if (next->pid) { - bool identical_state; - psi_flags_change(next, 0, TSK_ONCPU); /* - * When switching between tasks that have an identical - * runtime state, the cgroup that contains both tasks - * we reach the first common ancestor. Iterate @next's - * ancestors only until we encounter @prev's ONCPU. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. */ - identical_state = prev->psi_flags == next->psi_flags; iter = NULL; while ((group = iterate_groups(next, &iter))) { - if (identical_state && - per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { common = group; break; } @@ -877,10 +872,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_group_change(group, cpu, clear, set, now, wake_clock); /* - * TSK_ONCPU is handled up to the common ancestor. If we're tasked - * with dequeuing too, finish that for the rest of the hierarchy. + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. */ - if (sleep) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) psi_group_change(group, cpu, clear, set, now, wake_clock); -- cgit From 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Aug 2022 00:41:07 +0800 Subject: sched/psi: Remove NR_ONCPU task accounting We put all fields updated by the scheduler in the first cacheline of struct psi_group_cpu for performance. Since we want add another PSI_IRQ_FULL to track IRQ/SOFTIRQ pressure, we need to reclaim space first. 
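A small user-space sketch of the approach taken below: the on-CPU state becomes a single bit in state_mask and the CPU pressure checks compare against that bit instead of a dedicated task counter. The enum names follow the kernel's, but the PSI_ONCPU bit value is illustrative:

#include <stdbool.h>
#include <stdio.h>

/* Bit value is illustrative; in the kernel PSI_ONCPU sits above the PSI state bits. */
#define PSI_ONCPU	(1u << 7)

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING, NR_PSI_TASK_COUNTS };

/* CPU pressure now tests the on-CPU bit instead of a tasks[NR_ONCPU] counter. */
static bool cpu_some(const unsigned int *tasks, unsigned int state_mask)
{
	bool oncpu = state_mask & PSI_ONCPU;

	return tasks[NR_RUNNING] > oncpu;	/* runnable tasks waiting for the CPU */
}

int main(void)
{
	unsigned int tasks[NR_PSI_TASK_COUNTS] = { [NR_RUNNING] = 2 };
	unsigned int state_mask = PSI_ONCPU;	/* one of the two runnable tasks is on the CPU */

	printf("CPU SOME: %d\n", cpu_some(tasks, state_mask));	/* 1: one task is waiting */
	return 0;
}
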
This patch remove NR_ONCPU task accounting in struct psi_group_cpu, use one bit in state_mask to track instead. Signed-off-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20220825164111.29534-7-zhouchengming@bytedance.com --- kernel/sched/psi.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index d71dbc2356ff..4702a770e272 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -212,7 +212,7 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu) { switch (state) { case PSI_IO_SOME: @@ -225,9 +225,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return unlikely(tasks[NR_MEMSTALL] && tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] > oncpu); case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] && !oncpu); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -689,9 +689,9 @@ static void psi_group_change(struct psi_group *group, int cpu, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; enum psi_states s; + u32 state_mask; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -707,17 +707,36 @@ static void psi_group_change(struct psi_group *group, int cpu, record_times(groupc, now); + /* + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -726,9 +745,8 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) + if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); } @@ -740,7 +758,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. 
*/ - if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); groupc->state_mask = state_mask; @@ -828,7 +846,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ iter = NULL; while ((group = iterate_groups(next, &iter))) { - if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { common = group; break; } -- cgit From 52b1364ba0b105122d6de0e719b36db705011ac1 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:08 +0800 Subject: sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure Now PSI already tracked workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have obvious impact on some workload productivity, such as web service workload. When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time from update_rq_clock_task(), in which we can record that delta to CPU curr task's cgroups as PSI_IRQ_FULL status. Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in the current task on the CPU, make nothing productive could run even if it were runnable, so we only use PSI_IRQ_FULL. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com --- kernel/sched/psi.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4702a770e272..2545a78f82d8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + void *iter = NULL; + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + while ((group = iterate_groups(task, &iter))) { + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? 
"full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1405,6 +1445,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1412,6 +1479,9 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } -- cgit From dc86aba751e2867244411adda1562f6664747019 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 26 Aug 2022 00:41:10 +0800 Subject: sched/psi: Cache parent psi_group to speed up group iteration We use iterate_groups() to iterate each level psi_group to update PSI stats, which is a very hot path. In current code, iterate_groups() have to use multiple branches and cgroup_parent() to get parent psi_group for each level, which is not very efficient. This patch cache parent psi_group in struct psi_group, only need to get psi_group of task itself first, then just use group->parent to iterate. 
Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220825164111.29534-10-zhouchengming@bytedance.com --- kernel/sched/psi.c | 49 +++++++++++++++++++------------------------------ 1 file changed, 19 insertions(+), 30 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2545a78f82d8..9a8aee80a087 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -772,27 +772,12 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { - if (*iter == &psi_system) - return NULL; - #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) { - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } - } + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -815,7 +800,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - void *iter = NULL; u64 now; if (!task->pid) @@ -825,8 +809,10 @@ void psi_task_change(struct task_struct *task, int clear, int set) now = cpu_clock(cpu); - while ((group = iterate_groups(task, &iter))) + group = task_psi_group(task); + do { psi_group_change(group, cpu, clear, set, now, true); + } while ((group = group->parent)); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -834,7 +820,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - void *iter; u64 now = cpu_clock(cpu); if (next->pid) { @@ -844,8 +829,8 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - iter = NULL; - while ((group = iterate_groups(next, &iter))) { + group = task_psi_group(next); + do { if (per_cpu_ptr(group->pcpu, cpu)->state_mask & PSI_ONCPU) { common = group; @@ -853,7 +838,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); - } + } while ((group = group->parent)); } if (prev->pid) { @@ -886,9 +871,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) + group = task_psi_group(prev); + do { + if (group == common) + break; psi_group_change(group, cpu, clear, set, now, wake_clock); + } while ((group = group->parent)); /* * TSK_ONCPU is handled up to the common ancestor. 
If there are @@ -898,7 +886,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = iterate_groups(prev, &iter)) + for (; group; group = group->parent) psi_group_change(group, cpu, clear, set, now, wake_clock); } } @@ -908,7 +896,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, void psi_account_irqtime(struct task_struct *task, u32 delta) { int cpu = task_cpu(task); - void *iter = NULL; struct psi_group *group; struct psi_group_cpu *groupc; u64 now; @@ -918,7 +905,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) now = cpu_clock(cpu); - while ((group = iterate_groups(task, &iter))) { + group = task_psi_group(task); + do { groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); @@ -930,7 +918,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) if (group->poll_states & (1 << PSI_IRQ_FULL)) psi_schedule_poll_work(group, 1); - } + } while ((group = group->parent)); } #endif @@ -1010,6 +998,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return -ENOMEM; } group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } -- cgit From 34f26a15611afb03c33df6819359d36f5b382589 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Wed, 7 Sep 2022 17:03:32 +0800 Subject: sched/psi: Per-cgroup PSI accounting disable/re-enable interface PSI accounts stalls for each cgroup separately and aggregates it at each level of the hierarchy. This may cause non-negligible overhead for some workloads when under deep level of the hierarchy. commit 3958e2d0c34e ("cgroup: make per-cgroup pressure stall tracking configurable") make PSI to skip per-cgroup stall accounting, only account system-wide to avoid this each level overhead. But for our use case, we also want leaf cgroup PSI stats accounted for userspace adjustment on that cgroup, apart from only system-wide adjustment. So this patch introduce a per-cgroup PSI accounting disable/re-enable interface "cgroup.pressure", which is a read-write single value file that allowed values are "0" and "1", the defaults is "1" so per-cgroup PSI stats is enabled by default. Implementation details: It should be relatively straight-forward to disable and re-enable state aggregation, time tracking, averaging on a per-cgroup level, if we can live with losing history from while it was disabled. I.e. the avgs will restart from 0, total= will have gaps. But it's hard or complex to stop/restart groupc->tasks[] updates, which is not implemented in this patch. So we always update groupc->tasks[] and PSI_ONCPU bit in psi_group_change() even when the cgroup PSI stats is disabled. 
Suggested-by: Johannes Weiner Suggested-by: Tejun Heo Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20220907090332.2078-1-zhouchengming@bytedance.com --- kernel/sched/psi.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 7 deletions(-) (limited to 'kernel/sched/psi.c') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9a8aee80a087..9711827e31e5 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -181,6 +181,7 @@ static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); @@ -696,17 +697,16 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); - record_times(groupc, now); - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in @@ -745,6 +745,23 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; + } + for (s = 0; s < NR_PSI_STATES; s++) { if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); @@ -761,6 +778,8 @@ static void psi_group_change(struct psi_group *group, int cpu, if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); @@ -907,6 +926,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) group = task_psi_group(task); do { + if (!group->enabled) + continue; + groupc = per_cpu_ptr(group->pcpu, cpu); write_seqcount_begin(&groupc->seq); @@ -1080,6 +1102,40 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_state() loop, record_times() + * and averaging worker, see psi_group_change() for details. 
+ * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_state() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. + */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + u64 now; + + rq_lock_irq(rq, &rf); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) -- cgit
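The series above also adds a pressure/irq file under /proc. A small user-space sketch of reading it; the expected output line follows the psi_show() format shown earlier, only a "full" line is reported for IRQ, and the file exists only when CONFIG_IRQ_TIME_ACCOUNTING is enabled:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/pressure/irq", "r");

	if (!f) {
		perror("fopen");	/* file is only present with CONFIG_IRQ_TIME_ACCOUNTING */
		return 1;
	}
	/* Expected shape: full avg10=0.00 avg60=0.00 avg300=0.00 total=0 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
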