diff options
Diffstat (limited to 'kernel/sched/psi.c')
| -rw-r--r-- | kernel/sched/psi.c | 1128 |
1 files changed, 704 insertions, 424 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..59fdb7ebbf22 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -41,7 +41,7 @@ * What it means for a task to be productive is defined differently * for each resource. For IO, productive means a running task. For * memory, productive means a running task that isn't a reclaimer. For - * CPU, productive means an oncpu task. + * CPU, productive means an on-CPU task. * * Naturally, the FULL state doesn't exist for the CPU resource at the * system level, but exist at the cgroup level. At the cgroup level, @@ -49,7 +49,7 @@ * resource which is being used by others outside of the cgroup or * throttled by the cgroup cpu.max configuration. * - * The percentage of wallclock time spent in those compound stall + * The percentage of wall clock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, * where the SOME percentage indicates workload slowdowns and the FULL * percentage indicates reduced CPU utilization: @@ -136,26 +136,15 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ - -#include "../workqueue_internal.h" -#include <linux/sched/loadavg.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/seqlock.h> -#include <linux/uaccess.h> -#include <linux/cgroup.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/ctype.h> -#include <linux/file.h> -#include <linux/poll.h> +#include <linux/sched/clock.h> +#include <linux/workqueue.h> #include <linux/psi.h> #include "sched.h" static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); -DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); +static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; @@ -175,7 +164,6 @@ __setup("psi=", setup_psi); #define EXP_300s 2034 /* 1/exp(2s/300s) */ /* PSI trigger definitions */ -#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ @@ -188,38 +176,60 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); static void group_init(struct psi_group *group) { - int cpu; - - for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->enabled = true; group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; - INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); - /* Init trigger-related members */ - mutex_init(&group->trigger_lock); - INIT_LIST_HEAD(&group->triggers); - memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); - group->poll_states = 0; - group->poll_min_period = U32_MAX; - memset(group->polling_total, 0, sizeof(group->polling_total)); - group->polling_next_update = ULLONG_MAX; - group->polling_until = 0; - init_waitqueue_head(&group->poll_wait); - timer_setup(&group->poll_timer, poll_timer_fn, 0); - rcu_assign_pointer(group->poll_task, NULL); + + /* Init avg trigger-related members */ + INIT_LIST_HEAD(&group->avg_triggers); + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers)); + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + + /* Init rtpoll trigger-related members */ + atomic_set(&group->rtpoll_scheduled, 0); + mutex_init(&group->rtpoll_trigger_lock); + INIT_LIST_HEAD(&group->rtpoll_triggers); + group->rtpoll_min_period = U32_MAX; + group->rtpoll_next_update = ULLONG_MAX; + init_waitqueue_head(&group->rtpoll_wait); + timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->rtpoll_task, NULL); } void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } @@ -230,28 +240,32 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static u32 test_states(unsigned int *tasks, u32 state_mask) { - switch (state) { - case PSI_IO_SOME: - return unlikely(tasks[NR_IOWAIT]); - case PSI_IO_FULL: - return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); - case PSI_MEM_SOME: - return unlikely(tasks[NR_MEMSTALL]); - case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && - tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); - case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); - case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); - case PSI_NONIDLE: - return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || - tasks[NR_RUNNING]; - default: - return false; + const bool oncpu = state_mask & PSI_ONCPU; + + if (tasks[NR_IOWAIT]) { + state_mask |= BIT(PSI_IO_SOME); + if (!tasks[NR_RUNNING]) + state_mask |= BIT(PSI_IO_FULL); + } + + if (tasks[NR_MEMSTALL]) { + state_mask |= BIT(PSI_MEM_SOME); + if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]) + state_mask |= BIT(PSI_MEM_FULL); } + + if (tasks[NR_RUNNING] > oncpu) + state_mask |= BIT(PSI_CPU_SOME); + + if (tasks[NR_RUNNING] && !oncpu) + state_mask |= BIT(PSI_CPU_FULL); + + if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]) + state_mask |= BIT(PSI_NONIDLE); + + return state_mask; } static void get_recent_times(struct psi_group *group, int cpu, @@ -259,6 +273,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + int current_cpu = raw_smp_processor_id(); + unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; enum psi_states s; unsigned int seq; @@ -268,12 +284,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; - } while (read_seqcount_retry(&groupc->seq, seq)); + if (cpu == current_cpu) + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -297,6 +315,28 @@ static void get_recent_times(struct psi_group *group, int cpu, if (delta) *pchanged_states |= (1 << s); } + + /* + * When collect_percpu_times() from the avgs_work, we don't want to + * re-arm avgs_work when all CPUs are IDLE. But the current CPU running + * this avgs_work is never IDLE, cause avgs_work can't be shut off. + * So for the current CPU, we need to re-arm avgs_work only when + * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs + * we can just check PSI_NONIDLE delta. + */ + if (current_work() == &group->avgs_work.work) { + bool reschedule; + + if (cpu == current_cpu) + reschedule = tasks[NR_RUNNING] + + tasks[NR_IOWAIT] + + tasks[NR_MEMSTALL] > 1; + else + reschedule = *pchanged_states & (1 << PSI_NONIDLE); + + if (reschedule) + *pchanged_states |= PSI_STATE_RESCHEDULE; + } } static void calc_avgs(unsigned long avg[3], int missed_periods, @@ -331,7 +371,7 @@ static void collect_percpu_times(struct psi_group *group, /* * Collect the per-cpu time buckets and average them into a - * single time sample that is normalized to wallclock time. + * single time sample that is normalized to wall clock time. * * For averaging, each CPU is weighted by its non-idle time in * the sampling period. This eliminates artifacts from uneven @@ -374,6 +414,114 @@ static void collect_percpu_times(struct psi_group *group, *pchanged_states = changed_states; } +/* Trigger tracking window manipulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div64_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void update_triggers(struct psi_group *group, u64 now, + enum psi_aggregators aggregator) +{ + struct psi_trigger *t; + u64 *total = group->total[aggregator]; + struct list_head *triggers; + u64 *aggregator_total; + + if (aggregator == PSI_AVGS) { + triggers = &group->avg_triggers; + aggregator_total = group->avg_total; + } else { + triggers = &group->rtpoll_triggers; + aggregator_total = group->rtpoll_total; + } + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, triggers, node) { + u64 growth; + bool new_stall; + + new_stall = aggregator_total[t->state] != total[t->state]; + + /* Check for stall activity or a previous threshold breach */ + if (!new_stall && !t->pending_event) + continue; + /* + * Check for new stall activity, as well as deferred + * events that occurred in the last window after the + * trigger had already fired (we want to ratelimit + * events without dropping any). + */ + if (new_stall) { + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (!t->pending_event) { + if (growth < t->threshold) + continue; + + t->pending_event = true; + } + } + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) { + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + } + t->last_event_time = now; + /* Reset threshold breach flag once event got generated */ + t->pending_event = false; + } +} + static u64 update_averages(struct psi_group *group, u64 now) { unsigned long missed_periods = 0; @@ -432,7 +580,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool nonidle; u64 now; dwork = to_delayed_work(work); @@ -443,7 +590,6 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); collect_percpu_times(group, PSI_AVGS, &changed_states); - nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -451,10 +597,12 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - if (now >= group->avg_next_update) + if (now >= group->avg_next_update) { + update_triggers(group, now, PSI_AVGS); group->avg_next_update = update_averages(group, now); + } - if (nonidle) { + if (changed_states & PSI_STATE_RESCHEDULE) { schedule_delayed_work(dwork, nsecs_to_jiffies( group->avg_next_update - now) + 1); } @@ -462,202 +610,155 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } -/* Trigger tracking window manipulations */ -static void window_reset(struct psi_window *win, u64 now, u64 value, - u64 prev_growth) -{ - win->start_time = now; - win->start_value = value; - win->prev_growth = prev_growth; -} - -/* - * PSI growth tracking window update and growth calculation routine. - * - * This approximates a sliding tracking window by interpolating - * partially elapsed windows using historical growth data from the - * previous intervals. This minimizes memory requirements (by not storing - * all the intermediate values in the previous window) and simplifies - * the calculations. It works well because PSI signal changes only in - * positive direction and over relatively small window sizes the growth - * is close to linear. - */ -static u64 window_update(struct psi_window *win, u64 now, u64 value) -{ - u64 elapsed; - u64 growth; - - elapsed = now - win->start_time; - growth = value - win->start_value; - /* - * After each tracking window passes win->start_value and - * win->start_time get reset and win->prev_growth stores - * the average per-window growth of the previous window. - * win->prev_growth is then used to interpolate additional - * growth from the previous window assuming it was linear. - */ - if (elapsed > win->size) - window_reset(win, now, value, growth); - else { - u32 remaining; - - remaining = win->size - elapsed; - growth += div64_u64(win->prev_growth * remaining, win->size); - } - - return growth; -} - -static void init_triggers(struct psi_group *group, u64 now) +static void init_rtpoll_triggers(struct psi_group *group, u64 now) { struct psi_trigger *t; - list_for_each_entry(t, &group->triggers, node) + list_for_each_entry(t, &group->rtpoll_triggers, node) window_reset(&t->win, now, group->total[PSI_POLL][t->state], 0); - memcpy(group->polling_total, group->total[PSI_POLL], - sizeof(group->polling_total)); - group->polling_next_update = now + group->poll_min_period; + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + group->rtpoll_next_update = now + group->rtpoll_min_period; } -static u64 update_triggers(struct psi_group *group, u64 now) -{ - struct psi_trigger *t; - bool new_stall = false; - u64 *total = group->total[PSI_POLL]; - - /* - * On subsequent updates, calculate growth deltas and let - * watchers know when their specified thresholds are exceeded. - */ - list_for_each_entry(t, &group->triggers, node) { - u64 growth; - - /* Check for stall activity */ - if (group->polling_total[t->state] == total[t->state]) - continue; - - /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. - */ - new_stall = true; - - /* Calculate growth since last update */ - growth = window_update(&t->win, now, total[t->state]); - if (growth < t->threshold) - continue; - - /* Limit event signaling to once per window */ - if (now < t->last_event_time + t->win.size) - continue; - - /* Generate an event */ - if (cmpxchg(&t->event, 0, 1) == 0) - wake_up_interruptible(&t->event_wait); - t->last_event_time = now; - } - - if (new_stall) - memcpy(group->polling_total, total, - sizeof(group->polling_total)); - - return now + group->poll_min_period; -} - -/* Schedule polling if it's not already scheduled. */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +/* Schedule rtpolling if it's not already scheduled or forced. */ +static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, + bool force) { struct task_struct *task; /* - * Do not reschedule if already scheduled. - * Possible race with a timer scheduled after this check but before - * mod_timer below can be tolerated because group->polling_next_update - * will keep updates on schedule. + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_rtpoll_work). */ - if (timer_pending(&group->poll_timer)) + if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force) return; rcu_read_lock(); - task = rcu_dereference(group->poll_task); + task = rcu_dereference(group->rtpoll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ if (likely(task)) - mod_timer(&group->poll_timer, jiffies + delay); + mod_timer(&group->rtpoll_timer, jiffies + delay); + else + atomic_set(&group->rtpoll_scheduled, 0); rcu_read_unlock(); } -static void psi_poll_work(struct psi_group *group) +static void psi_rtpoll_work(struct psi_group *group) { + bool force_reschedule = false; u32 changed_states; u64 now; - mutex_lock(&group->trigger_lock); + mutex_lock(&group->rtpoll_trigger_lock); now = sched_clock(); + if (now > group->rtpoll_until) { + /* + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps + * updates correctly on schedule. + */ + atomic_set(&group->rtpoll_scheduled, 0); + /* + * A task change can race with the rtpoll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: + * + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker + * + * The atomic_xchg() implies a full barrier. + */ + smp_mb(); + } else { + /* The rtpolling window is not over, keep rescheduling */ + force_reschedule = true; + } + + collect_percpu_times(group, PSI_POLL, &changed_states); - if (changed_states & group->poll_states) { - /* Initialize trigger windows when entering polling mode */ - if (now > group->polling_until) - init_triggers(group, now); + if (changed_states & group->rtpoll_states) { + /* Initialize trigger windows when entering rtpolling mode */ + if (now > group->rtpoll_until) + init_rtpoll_triggers(group, now); /* * Keep the monitor active for at least the duration of the * minimum tracking window as long as monitor states are * changing. */ - group->polling_until = now + - group->poll_min_period * UPDATES_PER_WINDOW; + group->rtpoll_until = now + + group->rtpoll_min_period * UPDATES_PER_WINDOW; } - if (now > group->polling_until) { - group->polling_next_update = ULLONG_MAX; + if (now > group->rtpoll_until) { + group->rtpoll_next_update = ULLONG_MAX; goto out; } - if (now >= group->polling_next_update) - group->polling_next_update = update_triggers(group, now); + if (now >= group->rtpoll_next_update) { + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, PSI_POLL); + memcpy(group->rtpoll_total, group->total[PSI_POLL], + sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; + } - psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1); + psi_schedule_rtpoll_work(group, + nsecs_to_jiffies(group->rtpoll_next_update - now) + 1, + force_reschedule); out: - mutex_unlock(&group->trigger_lock); + mutex_unlock(&group->rtpoll_trigger_lock); } -static int psi_poll_worker(void *data) +static int psi_rtpoll_worker(void *data) { struct psi_group *group = (struct psi_group *)data; sched_set_fifo_low(current); while (true) { - wait_event_interruptible(group->poll_wait, - atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + wait_event_interruptible(group->rtpoll_wait, + atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) || kthread_should_stop()); if (kthread_should_stop()) break; - psi_poll_work(group); + psi_rtpoll_work(group); } return 0; } static void poll_timer_fn(struct timer_list *t) { - struct psi_group *group = from_timer(group, t, poll_timer); + struct psi_group *group = timer_container_of(group, t, rtpoll_timer); - atomic_set(&group->poll_wakeup, 1); - wake_up_interruptible(&group->poll_wait); + atomic_set(&group->rtpoll_wakeup, 1); + wake_up_interruptible(&group->rtpoll_wait); } static void record_times(struct psi_group_cpu *groupc, u64 now) @@ -689,40 +790,50 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, u64 now, - bool wake_clock) + unsigned int clear, unsigned int set, + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; - enum psi_states s; + u32 state_mask; + lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state - * change requested through the @clear and @set bits. + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. */ - write_seqcount_begin(&groupc->seq); - - record_times(groupc, now); + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -731,12 +842,24 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ - for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) - state_mask |= (1 << s); + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. + */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + return; } + state_mask = test_states(groupc->tasks, state_mask); + /* * Since we care about lost potential, a memstall is FULL * when there are no other working tasks, but also when @@ -745,41 +868,26 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. */ - if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); - groupc->state_mask = state_mask; + record_times(groupc, now); - write_seqcount_end(&groupc->seq); + groupc->state_mask = state_mask; - if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + if (state_mask & group->rtpoll_states) + psi_schedule_rtpoll_work(group, 1, false); if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { - if (*iter == &psi_system) - return NULL; - #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) { - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } - } + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -801,9 +909,6 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; u64 now; if (!task->pid) @@ -811,55 +916,44 @@ void psi_task_change(struct task_struct *task, int clear, int set) psi_flags_change(task, clear, set); + psi_write_begin(cpu); now = cpu_clock(cpu); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; - - while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, now, wake_clock); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); - void *iter; - u64 now = cpu_clock(cpu); + u64 now; - if (next->pid) { - bool identical_state; + psi_write_begin(cpu); + now = cpu_clock(cpu); + if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); /* - * When switching between tasks that have an identical - * runtime state, the cgroup that contains both tasks - * we reach the first common ancestor. Iterate @next's - * ancestors only until we encounter @prev's ONCPU. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. */ - identical_state = prev->psi_flags == next->psi_flags; - iter = NULL; - while ((group = iterate_groups(next, &iter))) { - if (identical_state && - per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); } } if (prev->pid) { int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; /* * When we're going to sleep, psi_dequeue() lets us @@ -873,25 +967,84 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; } psi_flags_change(prev, clear, set); - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, now, true); + for_each_group(group, task_psi_group(prev)) { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* - * TSK_ONCPU is handled up to the common ancestor. If we're tasked - * with dequeuing too, finish that for the rest of the hierarchy. + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. */ - if (sleep) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, now, true); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) +{ + int cpu = task_cpu(curr); + struct psi_group_cpu *groupc; + s64 delta; + u64 irq; + u64 now; + + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) + return; + + if (!curr->pid) + return; + + lockdep_assert_rq_held(rq); + if (prev && task_psi_group(prev) == task_psi_group(curr)) + return; + + irq = irq_time_read(cpu); + delta = (s64)(irq - rq->psi_irq_time); + if (delta < 0) + return; + rq->psi_irq_time = irq; + + psi_write_begin(cpu); + now = cpu_clock(cpu); + + for_each_group(group, task_psi_group(curr)) { + if (!group->enabled) + continue; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_rtpoll_work(group, 1, false); + } + psi_write_end(cpu); } +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -923,6 +1076,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -952,29 +1106,38 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; - cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); - if (!cgroup->psi.pcpu) + cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); + if (!cgroup->psi) return -ENOMEM; - group_init(&cgroup->psi); + + cgroup->psi->pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi->pcpu) { + kfree(cgroup->psi); + return -ENOMEM; + } + group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; - cancel_delayed_work_sync(&cgroup->psi.avgs_work); - free_percpu(cgroup->psi.pcpu); + cancel_delayed_work_sync(&cgroup->psi->avgs_work); + free_percpu(cgroup->psi->pcpu); /* All triggers must be removed by now */ - WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); + WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n"); + kfree(cgroup->psi); } /** @@ -995,7 +1158,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -1043,16 +1206,56 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_states() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_states() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. + */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + u64 now; + + guard(rq_lock_irq)(cpu_rq(cpu)); + + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); @@ -1061,18 +1264,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1082,55 +1292,25 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, size_t nbytes, enum psi_res res) +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of) { struct psi_trigger *t; enum psi_states state; u32 threshold_us; + bool privileged; u32 window_us; if (static_branch_likely(&psi_disabled)) return ERR_PTR(-EOPNOTSUPP); + /* + * Checking the privilege here on file->f_cred implies that a privileged user + * could open the file and delegate the write to an unprivileged one. + */ + privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) state = PSI_IO_SOME + res * 2; else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) @@ -1138,11 +1318,22 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us < WINDOW_MIN_US || - window_us > WINDOW_MAX_US) + if (window_us == 0 || window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* + * Unprivileged users can only use 2s windows so that averages aggregation + * work is used, and no RT threads need to be spawned. + */ + if (!privileged && window_us % 2000000) return ERR_PTR(-EINVAL); /* Check threshold */ @@ -1157,115 +1348,137 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; - init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); - - mutex_lock(&group->trigger_lock); - - if (!rcu_access_pointer(group->poll_task)) { - struct task_struct *task; - - task = kthread_create(psi_poll_worker, group, "psimon"); - if (IS_ERR(task)) { - kfree(t); - mutex_unlock(&group->trigger_lock); - return ERR_CAST(task); + t->of = of; + if (!of) + init_waitqueue_head(&t->event_wait); + t->pending_event = false; + t->aggregator = privileged ? PSI_POLL : PSI_AVGS; + + if (privileged) { + mutex_lock(&group->rtpoll_trigger_lock); + + if (!rcu_access_pointer(group->rtpoll_task)) { + struct task_struct *task; + + task = kthread_create(psi_rtpoll_worker, group, "psimon"); + if (IS_ERR(task)) { + kfree(t); + mutex_unlock(&group->rtpoll_trigger_lock); + return ERR_CAST(task); + } + atomic_set(&group->rtpoll_wakeup, 0); + wake_up_process(task); + rcu_assign_pointer(group->rtpoll_task, task); } - atomic_set(&group->poll_wakeup, 0); - wake_up_process(task); - rcu_assign_pointer(group->poll_task, task); - } - list_add(&t->node, &group->triggers); - group->poll_min_period = min(group->poll_min_period, - div_u64(t->win.size, UPDATES_PER_WINDOW)); - group->nr_triggers[t->state]++; - group->poll_states |= (1 << t->state); + list_add(&t->node, &group->rtpoll_triggers); + group->rtpoll_min_period = min(group->rtpoll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->rtpoll_nr_triggers[t->state]++; + group->rtpoll_states |= (1 << t->state); - mutex_unlock(&group->trigger_lock); + mutex_unlock(&group->rtpoll_trigger_lock); + } else { + mutex_lock(&group->avgs_lock); + list_add(&t->node, &group->avg_triggers); + group->avg_nr_triggers[t->state]++; + + mutex_unlock(&group->avgs_lock); + } return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* - * Wakeup waiters to stop polling. Can happen if cgroup is deleted - * from under a polling process. + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. */ - wake_up_interruptible(&t->event_wait); - - mutex_lock(&group->trigger_lock); - - if (!list_empty(&t->node)) { - struct psi_trigger *tmp; - u64 period = ULLONG_MAX; - - list_del(&t->node); - group->nr_triggers[t->state]--; - if (!group->nr_triggers[t->state]) - group->poll_states &= ~(1 << t->state); - /* reset min update period for the remaining triggers */ - list_for_each_entry(tmp, &group->triggers, node) - period = min(period, div_u64(tmp->win.size, - UPDATES_PER_WINDOW)); - group->poll_min_period = period; - /* Destroy poll_task when the last trigger is destroyed */ - if (group->poll_states == 0) { - group->polling_until = 0; - task_to_destroy = rcu_dereference_protected( - group->poll_task, - lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_task, NULL); - del_timer(&group->poll_timer); + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + + if (t->aggregator == PSI_AVGS) { + mutex_lock(&group->avgs_lock); + if (!list_empty(&t->node)) { + list_del(&t->node); + group->avg_nr_triggers[t->state]--; } + mutex_unlock(&group->avgs_lock); + } else { + mutex_lock(&group->rtpoll_trigger_lock); + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->rtpoll_nr_triggers[t->state]--; + if (!group->rtpoll_nr_triggers[t->state]) + group->rtpoll_states &= ~(1 << t->state); + /* + * Reset min update period for the remaining triggers + * iff the destroying trigger had the min window size. + */ + if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) { + list_for_each_entry(tmp, &group->rtpoll_triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->rtpoll_min_period = period; + } + /* Destroy rtpoll_task when the last trigger is destroyed */ + if (group->rtpoll_states == 0) { + group->rtpoll_until = 0; + task_to_destroy = rcu_dereference_protected( + group->rtpoll_task, + lockdep_is_held(&group->rtpoll_trigger_lock)); + rcu_assign_pointer(group->rtpoll_task, NULL); + timer_delete(&group->rtpoll_timer); + } + } + mutex_unlock(&group->rtpoll_trigger_lock); } - mutex_unlock(&group->trigger_lock); - /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_rtpoll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * rtpoll_task. */ synchronize_rcu(); /* - * Stop kthread 'psimon' after releasing trigger_lock to prevent a - * deadlock while waiting for psi_poll_work to acquire trigger_lock + * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent + * a deadlock while waiting for psi_rtpoll_work to acquire + * rtpoll_trigger_lock */ if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_task. + * can no longer be found through group->rtpoll_task. */ kthread_stop(task_to_destroy); + atomic_set(&group->rtpoll_scheduled, 0); } kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1275,27 +1488,52 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); - poll_wait(file, &t->event_wait, wait); + if (t->of) + kernfs_generic_poll(t->of, wait); + else + poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1316,14 +1554,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, res, file, NULL); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1358,7 +1606,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } @@ -1389,6 +1637,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_irq_show, NULL); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1396,7 +1671,12 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ |
