summary | refs | log | tree | commit | diff
path: root/kernel/sched/psi.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/psi.c')
-rw-r--r-- kernel/sched/psi.c | 114
1 files changed, 64 insertions, 50 deletions
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa5809c0f..1396674fa722 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -41,7 +41,7 @@
* What it means for a task to be productive is defined differently
* for each resource. For IO, productive means a running task. For
* memory, productive means a running task that isn't a reclaimer. For
- * CPU, productive means an oncpu task.
+ * CPU, productive means an on-CPU task.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level. At the cgroup level,
@@ -49,7 +49,7 @@
* resource which is being used by others outside of the cgroup or
* throttled by the cgroup cpu.max configuration.
*
- * The percentage of wallclock time spent in those compound stall
+ * The percentage of wall clock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
* where the SOME percentage indicates workload slowdowns and the FULL
* percentage indicates reduced CPU utilization:
@@ -218,28 +218,32 @@ void __init psi_init(void)
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
+static u32 test_states(unsigned int *tasks, u32 state_mask)
{
- switch (state) {
- case PSI_IO_SOME:
- return unlikely(tasks[NR_IOWAIT]);
- case PSI_IO_FULL:
- return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
- case PSI_MEM_SOME:
- return unlikely(tasks[NR_MEMSTALL]);
- case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] &&
- tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
- case PSI_CPU_SOME:
- return unlikely(tasks[NR_RUNNING] > oncpu);
- case PSI_CPU_FULL:
- return unlikely(tasks[NR_RUNNING] && !oncpu);
- case PSI_NONIDLE:
- return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
- tasks[NR_RUNNING];
- default:
- return false;
+ const bool oncpu = state_mask & PSI_ONCPU;
+
+ if (tasks[NR_IOWAIT]) {
+ state_mask |= BIT(PSI_IO_SOME);
+ if (!tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_IO_FULL);
+ }
+
+ if (tasks[NR_MEMSTALL]) {
+ state_mask |= BIT(PSI_MEM_SOME);
+ if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+ state_mask |= BIT(PSI_MEM_FULL);
}
+
+ if (tasks[NR_RUNNING] > oncpu)
+ state_mask |= BIT(PSI_CPU_SOME);
+
+ if (tasks[NR_RUNNING] && !oncpu)
+ state_mask |= BIT(PSI_CPU_FULL);
+
+ if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_NONIDLE);
+
+ return state_mask;
}
static void get_recent_times(struct psi_group *group, int cpu,
@@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group,
/*
* Collect the per-cpu time buckets and average them into a
- * single time sample that is normalized to wallclock time.
+ * single time sample that is normalized to wall clock time.
*
* For averaging, each CPU is weighted by its non-idle time in
* the sampling period. This eliminates artifacts from uneven
@@ -765,14 +769,15 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
}
static void psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set, u64 now,
+ unsigned int clear, unsigned int set,
bool wake_clock)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
- enum psi_states s;
u32 state_mask;
+ u64 now;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -785,6 +790,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
* SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
+ now = cpu_clock(cpu);
/*
* Start with TSK_ONCPU, which doesn't have a corresponding
@@ -841,10 +847,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
return;
}
- for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
- state_mask |= (1 << s);
- }
+ state_mask = test_states(groupc->tasks, state_mask);
/*
* Since we care about lost potential, a memstall is FULL
@@ -898,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
- u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- now = cpu_clock(cpu);
-
group = task_psi_group(task);
do {
- psi_group_change(group, cpu, clear, set, now, true);
+ psi_group_change(group, cpu, clear, set, true);
} while ((group = group->parent));
}
@@ -918,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
- u64 now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -935,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
break;
}
- psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ psi_group_change(group, cpu, 0, TSK_ONCPU, true);
} while ((group = group->parent));
}
@@ -973,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
do {
if (group == common)
break;
- psi_group_change(group, cpu, clear, set, now, wake_clock);
+ psi_group_change(group, cpu, clear, set, wake_clock);
} while ((group = group->parent));
/*
@@ -985,35 +984,47 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
for (; group; group = group->parent)
- psi_group_change(group, cpu, clear, set, now, wake_clock);
+ psi_group_change(group, cpu, clear, set, wake_clock);
}
}
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ s64 delta;
+ u64 irq;
- if (static_branch_likely(&psi_disabled))
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
- if (!task->pid)
+ if (!curr->pid)
return;
- now = cpu_clock(cpu);
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
+ return;
+
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
+ u64 now;
+
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
write_seqcount_begin(&groupc->seq);
+ now = cpu_clock(cpu);
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
@@ -1194,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group)
/*
* After we disable psi_group->enabled, we don't actually
* stop percpu tasks accounting in each psi_group_cpu,
+ * instead only stop the test_states() call, record_times()
+ * instead only stop test_states() loop, record_times()
* and averaging worker, see psi_group_change() for details.
*
* When disable cgroup PSI, this function has nothing to sync
@@ -1202,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group)
* would see !psi_group->enabled and only do task accounting.
*
* When re-enable cgroup PSI, this function use psi_group_change()
- * to get correct state mask from test_state() loop on tasks[],
+ * to get correct state mask from test_states() on tasks[],
* and restart groupc->state_start from now, use .clear = .set = 0
* here since no task status really changed.
*/
@@ -1212,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
- u64 now;
rq_lock_irq(rq, &rf);
- now = cpu_clock(cpu);
- psi_group_change(group, cpu, 0, 0, now, true);
+ psi_group_change(group, cpu, 0, 0, true);
rq_unlock_irq(rq, &rf);
}
}
@@ -1231,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
/* Update averages before reporting them */
mutex_lock(&group->avgs_lock);
now = sched_clock();
@@ -1426,7 +1440,7 @@ void psi_trigger_destroy(struct psi_trigger *t)
group->rtpoll_task,
lockdep_is_held(&group->rtpoll_trigger_lock));
rcu_assign_pointer(group->rtpoll_task, NULL);
- del_timer(&group->rtpoll_timer);
+ timer_delete(&group->rtpoll_timer);
}
}
mutex_unlock(&group->rtpoll_trigger_lock);