diff options
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- | kernel/sched/psi.c | 164 |
1 files changed, 90 insertions, 74 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 967732c0766c..db27b69fa92a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -34,7 +34,10 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level, means all non-idle tasks + * in a cgroup are delayed on the CPU resource which used by others outside + * of the cgroup or throttled by the cgroup cpu.max configuration. * * SOME = nr_delayed_tasks != 0 * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 @@ -59,7 +62,7 @@ * states, we would have to conclude a CPU SOME pressure number of * 100%, since *somebody* is waiting on a runqueue at all * times. However, that is clearly not the amount of contention the - * workload is experiencing: only one out of 256 possible exceution + * workload is experiencing: only one out of 256 possible execution * threads will be contended at any given time, or about 0.4%. * * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any @@ -73,7 +76,7 @@ * we have to base our calculation on the number of non-idle tasks in * conjunction with the number of available CPUs, which is the number * of potential execution threads. SOME becomes then the proportion of - * delayed tasks to possibe threads, and FULL is the share of possible + * delayed tasks to possible threads, and FULL is the share of possible * threads that are unproductive due to delays: * * threads = min(nr_nonidle_tasks, nr_cpus) @@ -216,15 +219,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) { switch (state) { case PSI_IO_SOME: - return tasks[NR_IOWAIT]; + return unlikely(tasks[NR_IOWAIT]); case PSI_IO_FULL: - return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); case PSI_MEM_SOME: - return tasks[NR_MEMSTALL]; + return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); case PSI_CPU_SOME: - return tasks[NR_RUNNING] > tasks[NR_ONCPU]; + return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); + case PSI_CPU_FULL: + return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -441,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } -/* Trigger tracking window manupulations */ +/* Trigger tracking window manipulations */ static void window_reset(struct psi_window *win, u64 now, u64 value, u64 prev_growth) { @@ -639,13 +644,10 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(&group->poll_wait); } -static void record_times(struct psi_group_cpu *groupc, int cpu, - bool memstall_tick) +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now; - now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now; @@ -659,34 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_MEM_SOME] += delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* - * Since we care about lost potential, a - * memstall is FULL when there are no other - * working tasks, but also when the CPU is - * actively reclaiming and nothing productive - * could run even if it were runnable. - * - * When the timer tick sees a reclaiming CPU, - * regardless of runnable tasks, sample a FULL - * tick (or less if it hasn't been a full tick - * since the last state change). - */ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } } - if (groupc->state_mask & (1 << PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + } if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, + unsigned int clear, unsigned int set, u64 now, bool wake_clock) { struct psi_group_cpu *groupc; @@ -706,19 +694,20 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, false); + record_times(groupc, now); for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; - if (groupc->tasks[t] == 0 && !psi_bug) { + if (groupc->tasks[t]) { + groupc->tasks[t]--; + } else if (!psi_bug) { printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], groupc->tasks[3], clear, set); psi_bug = 1; } - groupc->tasks[t]--; } for (t = 0; set; set &= ~(1 << t), t++) @@ -730,6 +719,18 @@ static void psi_group_change(struct psi_group *group, int cpu, if (test_state(groupc->tasks, s)) state_mask |= (1 << s); } + + /* + * Since we care about lost potential, a memstall is FULL + * when there are no other working tasks, but also when + * the CPU is actively reclaiming and nothing productive + * could run even if it were runnable. So when the current + * task in a cgroup is in_memstall, the corresponding groupc + * on that cpu is in PSI_MEM_FULL state. + */ + if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + state_mask |= (1 << PSI_MEM_FULL); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); @@ -786,12 +787,14 @@ void psi_task_change(struct task_struct *task, int clear, int set) struct psi_group *group; bool wake_clock = true; void *iter = NULL; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); + now = cpu_clock(cpu); /* * Periodic aggregation shuts off if there is a period of no * task changes, so we wake it back up if necessary. However, @@ -804,7 +807,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false; while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, wake_clock); + psi_group_change(group, cpu, clear, set, now, wake_clock); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -813,56 +816,61 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); void *iter; + u64 now = cpu_clock(cpu); if (next->pid) { + bool identical_state; + psi_flags_change(next, 0, TSK_ONCPU); /* - * When moving state between tasks, the group that - * contains them both does not change: we can stop - * updating the tree once we reach the first common - * ancestor. Iterate @next's ancestors until we - * encounter @prev's state. + * When switching between tasks that have an identical + * runtime state, the cgroup that contains both tasks + * runtime state, the cgroup that contains both tasks + * we reach the first common ancestor. Iterate @next's + * ancestors only until we encounter @prev's ONCPU. */ + identical_state = prev->psi_flags == next->psi_flags; iter = NULL; while ((group = iterate_groups(next, &iter))) { - if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + if (identical_state && + per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { common = group; break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); } } - /* - * If this is a voluntary sleep, dequeue will have taken care - * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We - * only need to deal with it during preemption. - */ - if (sleep) - return; - if (prev->pid) { - psi_flags_change(prev, TSK_ONCPU, 0); + int clear = TSK_ONCPU, set = 0; - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, TSK_ONCPU, 0, true); - } -} + /* + * When we're going to sleep, psi_dequeue() lets us handle + * TSK_RUNNING and TSK_IOWAIT here, where we can combine it + * with TSK_ONCPU and save walking common ancestors twice. + */ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + } -void psi_memstall_tick(struct task_struct *task, int cpu) -{ - struct psi_group *group; - void *iter = NULL; + psi_flags_change(prev, clear, set); - while ((group = iterate_groups(task, &iter))) { - struct psi_group_cpu *groupc; + iter = NULL; + while ((group = iterate_groups(prev, &iter)) && group != common) + psi_group_change(group, cpu, clear, set, now, true); - groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); + /* + * TSK_ONCPU is handled up to the common ancestor. If we're tasked + * with dequeuing too, finish that for the rest of the hierarchy. + */ + if (sleep) { + clear &= ~TSK_ONCPU; + for (; group; group = iterate_groups(prev, &iter)) + psi_group_change(group, cpu, clear, set, now, true); + } } } @@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2 - (res == PSI_CPU); full++) { + for (full = 0; full < 2; full++) { unsigned long avg[3]; u64 total; int w; @@ -1054,19 +1062,27 @@ static int psi_cpu_show(struct seq_file *m, void *v) return psi_show(m, &psi_system, PSI_CPU); } +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + static int psi_io_open(struct inode *inode, struct file *file) { - return single_open(file, psi_io_show, NULL); + return psi_open(file, psi_io_show); } static int psi_memory_open(struct inode *inode, struct file *file) { - return single_open(file, psi_memory_show, NULL); + return psi_open(file, psi_memory_show); } static int psi_cpu_open(struct inode *inode, struct file *file) { - return single_open(file, psi_cpu_show, NULL); + return psi_open(file, psi_cpu_show); } struct psi_trigger *psi_trigger_create(struct psi_group *group, @@ -1346,9 +1362,9 @@ static int __init psi_proc_init(void) { if (psi_enable) { proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_proc_ops); - proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops); + proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); } return 0; } |