Diffstat (limited to 'kernel/sched/psi.c')
| -rw-r--r-- | kernel/sched/psi.c | 1342 |
1 file changed, 1126 insertions, 216 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index fe24de3fbc93..59fdb7ebbf22 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1,9 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pressure stall information for CPU, memory and IO * * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -31,12 +35,21 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an on-CPU task. * - * The percentage of wallclock time spent in those compound stall + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. + * + * The percentage of wall clock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, * where the SOME percentage indicates workload slowdowns and the FULL * percentage indicates reduced CPU utilization: @@ -56,7 +69,7 @@ * states, we would have to conclude a CPU SOME pressure number of * 100%, since *somebody* is waiting on a runqueue at all * times. However, that is clearly not the amount of contention the - * workload is experiencing: only one out of 256 possible exceution + * workload is experiencing: only one out of 256 possible execution * threads will be contended at any given time, or about 0.4%. * * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any @@ -70,18 +83,18 @@ * we have to base our calculation on the number of non-idle tasks in * conjunction with the number of available CPUs, which is the number * of potential execution threads. 
SOME becomes then the proportion of - * delayed tasks to possibe threads, and FULL is the share of possible + * delayed tasks to possible threads, and FULL is the share of possible * threads that are unproductive due to delays: * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -106,7 +119,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -123,25 +136,20 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ - -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/workqueue.h> #include <linux/psi.h> #include "sched.h" static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -155,76 +163,135 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); -static void group_init(struct psi_group *group) +static inline void psi_write_begin(int cpu) { - int cpu; + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} - for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + +static void psi_avgs_work(struct work_struct *work); + +static void poll_timer_fn(struct timer_list *t); + +static void group_init(struct psi_group *group) +{ + group->enabled = true; + group->avg_last_update = sched_clock(); + group->avg_next_update = group->avg_last_update + psi_period; + 
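
As a worked illustration of the SOME/FULL formulas in the comment above, here is a minimal userspace sketch in plain C (not kernel code; the task counts are made-up instantaneous snapshots), reproducing the 257-number-crunchers-on-256-CPUs case:

#include <stdio.h>

/* Illustrative only: compute the SOME/FULL shares from instantaneous
 * counts, per the formulas in the comment above. */
static void psi_shares(unsigned int nr_cpus, unsigned int nr_nonidle,
		       unsigned int nr_delayed, unsigned int nr_productive)
{
	unsigned int threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
	double some, full;

	if (!threads)
		return;

	some = (double)nr_delayed / threads;
	if (some > 1.0)
		some = 1.0;

	full = (double)(threads -
			(nr_productive < threads ? nr_productive : threads)) / threads;

	printf("SOME=%.2f%% FULL=%.2f%%\n", some * 100, full * 100);
}

int main(void)
{
	psi_shares(256, 257, 1, 256);	/* 257 number crunchers on 256 CPUs: ~0.4% SOME, 0% FULL */
	psi_shares(4, 4, 1, 3);		/* 1 of 4 tasks delayed on memory, 3 still productive */
	return 0;
}
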
mutex_init(&group->avgs_lock); + + /* Init avg trigger-related members */ + INIT_LIST_HEAD(&group->avg_triggers); + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers)); + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + + /* Init rtpoll trigger-related members */ + atomic_set(&group->rtpoll_scheduled, 0); + mutex_init(&group->rtpoll_trigger_lock); + INIT_LIST_HEAD(&group->rtpoll_triggers); + group->rtpoll_min_period = U32_MAX; + group->rtpoll_next_update = ULLONG_MAX; + init_waitqueue_head(&group->rtpoll_wait); + timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->rtpoll_task, NULL); } void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } + if (!cgroup_psi_enabled()) + static_branch_disable(&psi_cgroups_enabled); + psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static u32 test_states(unsigned int *tasks, u32 state_mask) { - switch (state) { - case PSI_IO_SOME: - return tasks[NR_IOWAIT]; - case PSI_IO_FULL: - return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; - case PSI_MEM_SOME: - return tasks[NR_MEMSTALL]; - case PSI_MEM_FULL: - return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; - case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; - case PSI_NONIDLE: - return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || - tasks[NR_RUNNING]; - default: - return false; + const bool oncpu = state_mask & PSI_ONCPU; + + if (tasks[NR_IOWAIT]) { + state_mask |= BIT(PSI_IO_SOME); + if (!tasks[NR_RUNNING]) + state_mask |= BIT(PSI_IO_FULL); } + + if (tasks[NR_MEMSTALL]) { + state_mask |= BIT(PSI_MEM_SOME); + if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]) + state_mask |= BIT(PSI_MEM_FULL); + } + + if (tasks[NR_RUNNING] > oncpu) + state_mask |= BIT(PSI_CPU_SOME); + + if (tasks[NR_RUNNING] && !oncpu) + state_mask |= BIT(PSI_CPU_FULL); + + if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]) + state_mask |= BIT(PSI_NONIDLE); + + return state_mask; } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + int current_cpu = raw_smp_processor_id(); unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; - } while (read_seqcount_retry(&groupc->seq, seq)); + if (cpu == current_cpu) + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -238,13 +305,37 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. 
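
The new test_states() collapses the old per-state checks into a single pass over the task counts. The sketch below mirrors that logic in standalone C (toy enum and function names, PSI_ONCPU reduced to a boolean) to show how one snapshot of counts maps to a state bitmask:

#include <stdio.h>

/* Standalone mirror of the test_states() logic above; names are toy ones. */
enum { IO_SOME, IO_FULL, MEM_SOME, MEM_FULL, CPU_SOME, CPU_FULL, NONIDLE };

static unsigned int states(unsigned int iowait, unsigned int memstall,
			   unsigned int memstall_running, unsigned int running,
			   int oncpu)
{
	unsigned int mask = 0;

	if (iowait) {
		mask |= 1 << IO_SOME;
		if (!running)
			mask |= 1 << IO_FULL;
	}
	if (memstall) {
		mask |= 1 << MEM_SOME;
		/* FULL: every runnable task is itself stuck in reclaim */
		if (running == memstall_running)
			mask |= 1 << MEM_FULL;
	}
	if (running > (oncpu ? 1u : 0u))	/* somebody is waiting for the CPU */
		mask |= 1 << CPU_SOME;
	if (running && !oncpu)			/* runnable but starved of the CPU */
		mask |= 1 << CPU_FULL;
	if (iowait || memstall || running)
		mask |= 1 << NONIDLE;

	return mask;
}

int main(void)
{
	/* One runnable task that is reclaiming, nothing from this group on-CPU. */
	printf("mask=%#x\n", states(0, 1, 1, 1, 0));
	/* Two runnable tasks, one of them currently on the CPU. */
	printf("mask=%#x\n", states(0, 0, 0, 2, 1));
	return 0;
}
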
*/ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); + } + + /* + * When collect_percpu_times() from the avgs_work, we don't want to + * re-arm avgs_work when all CPUs are IDLE. But the current CPU running + * this avgs_work is never IDLE, cause avgs_work can't be shut off. + * So for the current CPU, we need to re-arm avgs_work only when + * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs + * we can just check PSI_NONIDLE delta. + */ + if (current_work() == &group->avgs_work.work) { + bool reschedule; + + if (cpu == current_cpu) + reschedule = tasks[NR_RUNNING] + + tasks[NR_IOWAIT] + + tasks[NR_MEMSTALL] > 1; + else + reschedule = *pchanged_states & (1 << PSI_NONIDLE); + + if (reschedule) + *pchanged_states |= PSI_STATE_RESCHEDULE; } } @@ -268,20 +359,19 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a - * single time sample that is normalized to wallclock time. + * single time sample that is normalized to wall clock time. * * For averaging, each CPU is weighted by its non-idle time in * the sampling period. This eliminates artifacts from uneven @@ -290,8 +380,11 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, aggregator, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -314,14 +407,131 @@ static bool update_stats(struct psi_group *group) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); + + if (pchanged_states) + *pchanged_states = changed_states; +} + +/* Trigger tracking window manipulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. 
+ */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div64_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void update_triggers(struct psi_group *group, u64 now, + enum psi_aggregators aggregator) +{ + struct psi_trigger *t; + u64 *total = group->total[aggregator]; + struct list_head *triggers; + u64 *aggregator_total; + + if (aggregator == PSI_AVGS) { + triggers = &group->avg_triggers; + aggregator_total = group->avg_total; + } else { + triggers = &group->rtpoll_triggers; + aggregator_total = group->rtpoll_total; + } + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, triggers, node) { + u64 growth; + bool new_stall; + + new_stall = aggregator_total[t->state] != total[t->state]; + + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; + /* + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). + */ + if (new_stall) { + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (!t->pending_event) { + if (growth < t->threshold) + continue; + + t->pending_event = true; + } + } + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) { + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + } + t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; + } +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; - if (now - expires > psi_period) + expires = group->avg_next_update; + if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); /* @@ -331,14 +541,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. 
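
To make the interpolation in window_update() above concrete, here is a standalone model in plain C (illustrative struct and numbers, not the kernel types). With a 1s window whose previous period grew by 100ms, a reader 400ms into the current window that has accumulated 30ms of new stall sees 30ms plus 60% of the previous 100ms, i.e. 90ms of growth:

#include <stdio.h>
#include <stdint.h>

struct win {
	uint64_t size;		/* window length, ns */
	uint64_t start_time;
	uint64_t start_value;
	uint64_t prev_growth;	/* growth observed over the previous window */
};

/* Same idea as window_update(): report growth in the current (partial)
 * window, topped up with a linear share of the previous window. */
static uint64_t win_update(struct win *w, uint64_t now, uint64_t value)
{
	uint64_t elapsed = now - w->start_time;
	uint64_t growth = value - w->start_value;

	if (elapsed > w->size) {
		/* window rolled over: restart tracking from here */
		w->start_time = now;
		w->start_value = value;
		w->prev_growth = growth;
	} else {
		uint64_t remaining = w->size - elapsed;

		growth += w->prev_growth * remaining / w->size;
	}
	return growth;
}

int main(void)
{
	struct win w = { .size = 1000000000ULL, .start_time = 0,
			 .start_value = 0, .prev_growth = 100000000ULL };

	/* 400ms into the window, 30ms of new stall so far:
	 * 30ms + 100ms * 0.6 = 90ms of interpolated growth. */
	printf("growth=%llu ns\n",
	       (unsigned long long)win_update(&w, 400000000ULL, 30000000ULL));
	return 0;
}
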
*/ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -358,23 +568,28 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; - bool nonidle; + u32 changed_states; + u64 now; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, &changed_states); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -382,172 +597,454 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ + if (now >= group->avg_next_update) { + update_triggers(group, now, PSI_AVGS); + group->avg_next_update = update_averages(group, now); + } - nonidle = update_stats(group); + if (changed_states & PSI_STATE_RESCHEDULE) { + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } - if (nonidle) { - unsigned long delay = 0; - u64 now; + mutex_unlock(&group->avgs_lock); +} - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); +static void init_rtpoll_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->rtpoll_triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + group->rtpoll_next_update = now + group->rtpoll_min_period; +} + +/* Schedule rtpolling if it's not already scheduled or forced. */ +static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, + bool force) +{ + struct task_struct *task; + + /* + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_rtpoll_work). 
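
update_averages() hands each roughly-2-second sample to calc_avgs(), which applies the scheduler's fixed-point calc_load() decay with the EXP_10s/EXP_60s/EXP_300s constants defined earlier in this file. Below is a sketch of that arithmetic, assuming the conventional FIXED_1 = 2048 fixed point and EXP_10s = 1677 (1/exp(2s/10s) in that scale); the kernel's rounding tweak is omitted:

#include <stdio.h>

#define FIXED_1		2048UL		/* 1.0 in 11-bit fixed point */
#define EXP_10s		1677UL		/* 1/exp(2s/10s), scaled by FIXED_1 */

/* One decay step of the exponentially weighted moving average,
 * shaped like the kernel's calc_load() (rounding detail omitted). */
static unsigned long ewma_step(unsigned long avg, unsigned long exp,
			       unsigned long sample)
{
	return (avg * exp + sample * (FIXED_1 - exp)) / FIXED_1;
}

int main(void)
{
	/* Feed ten consecutive "50% pressure" samples into an empty avg10. */
	unsigned long avg = 0, pct = 50 * FIXED_1;
	int i;

	for (i = 0; i < 10; i++) {
		avg = ewma_step(avg, EXP_10s, pct);
		printf("after %2ds: avg10 = %lu.%02lu%%\n", (i + 1) * 2,
		       avg / FIXED_1, (avg % FIXED_1) * 100 / FIXED_1);
	}
	return 0;
}
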
+ */ + if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force) + return; + + rcu_read_lock(); + + task = rcu_dereference(group->rtpoll_task); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(task)) + mod_timer(&group->rtpoll_timer, jiffies + delay); + else + atomic_set(&group->rtpoll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_rtpoll_work(struct psi_group *group) +{ + bool force_reschedule = false; + u32 changed_states; + u64 now; + + mutex_lock(&group->rtpoll_trigger_lock); + + now = sched_clock(); + + if (now > group->rtpoll_until) { + /* + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps + * updates correctly on schedule. + */ + atomic_set(&group->rtpoll_scheduled, 0); + /* + * A task change can race with the rtpoll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: + * + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker + * + * The atomic_xchg() implies a full barrier. + */ + smp_mb(); + } else { + /* The rtpolling window is not over, keep rescheduling */ + force_reschedule = true; } + + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->rtpoll_states) { + /* Initialize trigger windows when entering rtpolling mode */ + if (now > group->rtpoll_until) + init_rtpoll_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. 
+ */ + group->rtpoll_until = now + + group->rtpoll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->rtpoll_until) { + group->rtpoll_next_update = ULLONG_MAX; + goto out; + } + + if (now >= group->rtpoll_next_update) { + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, PSI_POLL); + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; + } + + psi_schedule_rtpoll_work(group, + nsecs_to_jiffies(group->rtpoll_next_update - now) + 1, + force_reschedule); + +out: + mutex_unlock(&group->rtpoll_trigger_lock); +} + +static int psi_rtpoll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + + sched_set_fifo_low(current); + + while (true) { + wait_event_interruptible(group->rtpoll_wait, + atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_rtpoll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = timer_container_of(group, t, rtpoll_timer); + + atomic_set(&group->rtpoll_wakeup, 1); + wake_up_interruptible(&group->rtpoll_wait); } -static void record_times(struct psi_group_cpu *groupc, int cpu, - bool memstall_tick) +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now; - now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* - * Since we care about lost potential, a - * memstall is FULL when there are no other - * working tasks, but also when the CPU is - * actively reclaiming and nothing productive - * could run even if it were runnable. - * - * When the timer tick sees a reclaiming CPU, - * regardless of runnable tasks, sample a FULL - * tick (or less if it hasn't been a full tick - * since the last state change). - */ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + } - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) + unsigned int clear, unsigned int set, + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; + u32 state_mask; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. 
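
The store/exchange ordering that psi_rtpoll_work() documents can be modelled with C11 atomics. The sketch below is only an illustration of the handshake (illustrative names, single-threaded driver), not the kernel code:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int rtpoll_scheduled;
static atomic_int pending_states;	/* stand-in for the changed-state mask */

/* Task-change side: publish the state, then try to claim the schedule slot.
 * atomic_exchange() is sequentially consistent, like the kernel's atomic_xchg(). */
static void task_change(int states)
{
	atomic_store(&pending_states, states);		/* STORE states */
	if (atomic_exchange(&rtpoll_scheduled, 1) == 0)
		printf("task change: scheduling the poll worker\n");
}

/* Poll-worker side: reopen the scheduling window, then re-read the states.
 * The fence pairs with the exchange above so neither side can miss the other. */
static int rtpoll_tick(void)
{
	atomic_store(&rtpoll_scheduled, 0);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() equivalent */
	return atomic_load(&pending_states);		/* LOAD states */
}

int main(void)
{
	task_change(0x4);	/* some state bit changes */
	printf("poll worker sees states %#x\n", rtpoll_tick());
	task_change(0x8);	/* a later change reschedules the worker again */
	return 0;
}
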
- * - * Then we update the task counts according to the state - * change requested through the @clear and @set bits. + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. */ - write_seqcount_begin(&groupc->seq); - - record_times(groupc, cpu, false); + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; - if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + if (groupc->tasks[t]) { + groupc->tasks[t]--; + } else if (!psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } - groupc->tasks[t]--; } for (t = 0; set; set &= ~(1 << t), t++) if (set & (1 << t)) groupc->tasks[t]++; - write_seqcount_end(&groupc->seq); + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; - if (!delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + return; + } + + state_mask = test_states(groupc->tasks, state_mask); + + /* + * Since we care about lost potential, a memstall is FULL + * when there are no other working tasks, but also when + * the CPU is actively reclaiming and nothing productive + * could run even if it were runnable. So when the current + * task in a cgroup is in_memstall, the corresponding groupc + * on that cpu is in PSI_MEM_FULL state. 
+ */ + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) + state_mask |= (1 << PSI_MEM_FULL); + + record_times(groupc, now); + + groupc->state_mask = state_mask; + + if (state_mask & group->rtpoll_states) + psi_schedule_rtpoll_work(group, 1, false); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { #ifdef CONFIG_CGROUPS - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else if (*iter == &psi_system) - return NULL; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } -#else - if (*iter) - return NULL; + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } -void psi_task_change(struct task_struct *task, int clear, int set) +static void psi_flags_change(struct task_struct *task, int clear, int set) { - int cpu = task_cpu(task); - struct psi_group *group; - void *iter = NULL; - - if (!task->pid) - return; - if (((task->psi_flags & set) || (task->psi_flags & clear) != clear) && !psi_bug) { printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", - task->pid, task->comm, cpu, + task->pid, task->comm, task_cpu(task), task->psi_flags, clear, set); psi_bug = 1; } task->psi_flags &= ~clear; task->psi_flags |= set; +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + u64 now; + + if (!task->pid) + return; + + psi_flags_change(task, clear, set); - while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } -void psi_memstall_tick(struct task_struct *task, int cpu) +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep) { - struct psi_group *group; - void *iter = NULL; + struct psi_group *common = NULL; + int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); + + if (next->pid) { + psi_flags_change(next, 0, TSK_ONCPU); + /* + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. + */ + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { + common = group; + break; + } + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } + } + + if (prev->pid) { + int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; + + /* + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. + */ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. 
However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; + } + + psi_flags_change(prev, clear, set); + + for_each_group(group, task_psi_group(prev)) { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, now, wake_clock); + } - while ((group = iterate_groups(task, &iter))) { - struct psi_group_cpu *groupc; + /* + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. + */ + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { + clear &= ~TSK_ONCPU; + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); + } + } + psi_write_end(cpu); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) +{ + int cpu = task_cpu(curr); + struct psi_group_cpu *groupc; + s64 delta; + u64 irq; + u64 now; + + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) + return; + + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + if (prev && task_psi_group(prev) == task_psi_group(curr)) + return; + + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; + + psi_write_begin(cpu); + now = cpu_clock(cpu); + + for_each_group(group, task_psi_group(curr)) { + if (!group->enabled) + continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_rtpoll_work(group, 1, false); } + psi_write_end(cpu); } +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -564,21 +1061,22 @@ void psi_memstall_enter(unsigned long *flags) if (static_branch_likely(&psi_disabled)) return; - *flags = current->flags & PF_MEMSTALL; + *flags = current->in_memstall; if (*flags) return; /* - * PF_MEMSTALL setting & accounting needs to be atomic wrt + * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can * race with CPU migration. */ rq = this_rq_lock_irq(&rf); - current->flags |= PF_MEMSTALL; - psi_task_change(current, 0, TSK_MEMSTALL); + current->in_memstall = 1; + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -597,38 +1095,49 @@ void psi_memstall_leave(unsigned long *flags) if (*flags) return; /* - * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could * race with CPU migration. 
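
The common-ancestor optimization in psi_task_switch() above (set TSK_ONCPU up next's ancestry until a group already has PSI_ONCPU set, then clear prev's ancestry only below that point) can be pictured with a toy parent-linked walk; the names, and the absence of locking and per-CPU state, are purely illustrative:

#include <stdbool.h>
#include <stdio.h>

/* Toy model: each group has a parent pointer and an "oncpu" flag,
 * standing in for PSI_ONCPU in groupc->state_mask. */
struct group {
	const char *name;
	struct group *parent;
	bool oncpu;
};

static void task_switch(struct group *prev_grp, struct group *next_grp)
{
	struct group *common = NULL, *g;

	/* Mark next's ancestry ONCPU; stop at the first group that is
	 * already marked, since everything above it is shared with prev. */
	for (g = next_grp; g; g = g->parent) {
		if (g->oncpu) {
			common = g;
			break;
		}
		g->oncpu = true;
		printf("set ONCPU on %s\n", g->name);
	}

	/* Clear ONCPU on prev's ancestry, but only below the common ancestor. */
	for (g = prev_grp; g && g != common; g = g->parent) {
		g->oncpu = false;
		printf("cleared ONCPU on %s\n", g->name);
	}
}

int main(void)
{
	struct group root = { "root", NULL, true };
	struct group a    = { "a", &root, true };
	struct group a_b  = { "a/b", &a, true };	/* prev ran here */
	struct group a_c  = { "a/c", &a, false };	/* next runs here */

	task_switch(&a_b, &a_c);	/* only a/b and a/c are touched; root and a stay set */
	return 0;
}
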
*/ rq = this_rq_lock_irq(&rf); - current->flags &= ~PF_MEMSTALL; - psi_task_change(current, TSK_MEMSTALL, 0); + current->in_memstall = 0; + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; - cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); - if (!cgroup->psi.pcpu) + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); + if (!cgroup->psi) return -ENOMEM; - group_init(&cgroup->psi); + + cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi->pcpu) { + kfree(cgroup->psi); + return -ENOMEM; + } + group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); - free_percpu(cgroup->psi.pcpu); + cancel_delayed_work_sync(&cgroup->psi->avgs_work); + free_percpu(cgroup->psi->pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); + kfree(cgroup->psi); } /** @@ -645,11 +1154,11 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -660,13 +1169,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); @@ -679,28 +1206,83 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_states() loop, record_times() + * and averaging worker, see psi_group_change() for details. 
+ * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_states() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. + */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + u64 now; + + guard(rq_lock_irq)(cpu_rq(cpu)); + + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif - for (full = 0; full < 2 - (res == PSI_CPU); full++) { - unsigned long avg[3]; - u64 total; + for (full = 0; full < 2 - only_full; full++) { + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -710,6 +1292,218 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + bool privileged; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * Checking the privilege here on file->f_cred implies that a privileged user + * could open the file and delegate the write to an unprivileged one. + */ + privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us == 0 || window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* + * Unprivileged users can only use 2s windows so that averages aggregation + * work is used, and no RT threads need to be spawned. 
+ */ + if (!privileged && window_us % 2000000) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); + + t->event = 0; + t->last_event_time = 0; + t->of = of; + if (!of) + init_waitqueue_head(&t->event_wait); + t->pending_event = false; + t->aggregator = privileged ? PSI_POLL : PSI_AVGS; + + if (privileged) { + mutex_lock(&group->rtpoll_trigger_lock); + + if (!rcu_access_pointer(group->rtpoll_task)) { + struct task_struct *task; + + task = kthread_create(psi_rtpoll_worker, group, "psimon"); + if (IS_ERR(task)) { + kfree(t); + mutex_unlock(&group->rtpoll_trigger_lock); + return ERR_CAST(task); + } + atomic_set(&group->rtpoll_wakeup, 0); + wake_up_process(task); + rcu_assign_pointer(group->rtpoll_task, task); + } + + list_add(&t->node, &group->rtpoll_triggers); + group->rtpoll_min_period = min(group->rtpoll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->rtpoll_nr_triggers[t->state]++; + group->rtpoll_states |= (1 << t->state); + + mutex_unlock(&group->rtpoll_trigger_lock); + } else { + mutex_lock(&group->avgs_lock); + + list_add(&t->node, &group->avg_triggers); + group->avg_nr_triggers[t->state]++; + + mutex_unlock(&group->avgs_lock); + } + return t; +} + +void psi_trigger_destroy(struct psi_trigger *t) +{ + struct psi_group *group; + struct task_struct *task_to_destroy = NULL; + + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) + return; + + group = t->group; + /* + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. + */ + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + + if (t->aggregator == PSI_AVGS) { + mutex_lock(&group->avgs_lock); + if (!list_empty(&t->node)) { + list_del(&t->node); + group->avg_nr_triggers[t->state]--; + } + mutex_unlock(&group->avgs_lock); + } else { + mutex_lock(&group->rtpoll_trigger_lock); + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->rtpoll_nr_triggers[t->state]--; + if (!group->rtpoll_nr_triggers[t->state]) + group->rtpoll_states &= ~(1 << t->state); + /* + * Reset min update period for the remaining triggers + * iff the destroying trigger had the min window size. + */ + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + } + /* Destroy rtpoll_task when the last trigger is destroyed */ + if (group->rtpoll_states == 0) { + group->rtpoll_until = 0; + task_to_destroy = rcu_dereference_protected( + group->rtpoll_task, + lockdep_is_held(&group->rtpoll_trigger_lock)); + rcu_assign_pointer(group->rtpoll_task, NULL); + timer_delete(&group->rtpoll_timer); + } + } + mutex_unlock(&group->rtpoll_trigger_lock); + } + + /* + * Wait for psi_schedule_rtpoll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * rtpoll_task. 
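
For context, the userspace side of the trigger interface created above follows the monitor pattern described in Documentation/accounting/psi.rst: write "<some|full> <threshold_us> <window_us>" to a pressure file opened read-write, then poll() it for POLLPRI. A trimmed-down example asking for a notification whenever memory SOME stall exceeds 150ms within any 1s window (error handling reduced to perror):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";	/* threshold 150ms, window 1s, in us */
	struct pollfd fds;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0) {
		perror("open /proc/pressure/memory");
		return 1;
	}
	if (write(fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}

	fds.fd = fd;
	fds.events = POLLPRI;

	while (1) {
		if (poll(&fds, 1, -1) < 0) {
			perror("poll");
			break;
		}
		if (fds.revents & POLLERR) {
			fprintf(stderr, "event source is gone\n");
			break;
		}
		if (fds.revents & POLLPRI)
			printf("memory pressure event\n");
	}
	close(fd);
	return 0;
}

Per the privilege check above, only CAP_SYS_RESOURCE callers get the RT psimon kthread and the rtpoll machinery; unprivileged writers must use window sizes that are multiples of 2s and are serviced from the averaging work instead.
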
+ */ + synchronize_rcu(); + /* + * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent + * a deadlock while waiting for psi_rtpoll_work to acquire + * rtpoll_trigger_lock + */ + if (task_to_destroy) { + /* + * After the RCU grace period has expired, the worker + * can no longer be found through group->rtpoll_task. + */ + kthread_stop(task_to_destroy); + atomic_set(&group->rtpoll_scheduled, 0); + } + kfree(t); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + t = smp_load_acquire(trigger_ptr); + if (!t) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + if (t->of) + kernfs_generic_poll(t->of, wait); + else + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + return ret; +} + +#ifdef CONFIG_PROC_FS static int psi_io_show(struct seq_file *m, void *v) { return psi_show(m, &psi_system, PSI_IO); @@ -740,33 +1534,149 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } -static const struct file_operations psi_io_fops = { - .open = psi_io_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + if (!nbytes) + return -EINVAL; + + buf_size = min(nbytes, sizeof(buf)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + seq = file->private_data; + + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, res, file, NULL); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_destroy(seq->private); + return single_release(inode, file); +} + +static const struct proc_ops psi_io_proc_ops = { + .proc_open = psi_io_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_io_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_memory_fops = { - .open = psi_memory_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, 
+static const struct proc_ops psi_memory_proc_ops = { + .proc_open = psi_memory_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_memory_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; -static const struct file_operations psi_cpu_fops = { - .open = psi_cpu_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops psi_cpu_proc_ops = { + .proc_open = psi_cpu_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_cpu_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_irq_show, NULL); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + static int __init psi_proc_init(void) { - proc_mkdir("pressure", NULL); - proc_create("pressure/io", 0, NULL, &psi_io_fops); - proc_create("pressure/memory", 0, NULL, &psi_memory_fops); - proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + if (psi_enable) { + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); + proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); + proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif + } return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ |
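
Reading the resulting files is plain procfs text in the format emitted by psi_show() above (totals in microseconds); a minimal reader/parsing sketch:

#include <stdio.h>

int main(void)
{
	char line[256], which[8];
	unsigned long long total;
	float avg10, avg60, avg300;
	FILE *f = fopen("/proc/pressure/cpu", "r");

	if (!f) {
		perror("fopen /proc/pressure/cpu");
		return 1;
	}
	/* Each line looks like:
	 *   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
	 */
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%7s avg10=%f avg60=%f avg300=%f total=%llu",
			   which, &avg10, &avg60, &avg300, &total) == 5)
			printf("%s: avg10=%.2f%% (total %llu us stalled)\n",
			       which, avg10, total);
	}
	fclose(f);
	return 0;
}
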
