From acb2ec3dd003b50b6fb5772057a08ec0dc45d42a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 May 2019 15:40:43 -0700 Subject: kernel/Makefile: don't assume that kernel/gen_ikh_data.sh is executable If the user downloads and applies patch-5.1.gz using patch(1), the x bit on kernel/gen_ikh_data.sh is not set. /bin/sh: 1: ./kernel/gen_ikh_data.sh: Permission denied Fix this by using CONFIG_SHELL. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 298437bb2c6a..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) -- cgit From c3f3ce049f7d97cc7ec9c01cb51d9ec74e0f37c2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:46 -0700 Subject: userfaultfd: use RCU to free the task struct when fork fails The task structure is freed while get_mem_cgroup_from_mm() holds rcu_read_lock() and dereferences mm->owner. get_mem_cgroup_from_mm() failing fork() ---- --- task = mm->owner mm->owner = NULL; free(task) if (task) *task; /* use after free */ The fix consists in freeing the task with RCU also in the fork failure case, exactly like it always happens for the regular exit(2) path. That is enough to make the rcu_read_lock hold in get_mem_cgroup_from_mm() (left side above) effective to avoid a use after free when dereferencing the task structure. An alternate possible fix would be to defer the delivery of the userfaultfd contexts to the monitor until after fork() is guaranteed to succeed. Such a change would require more changes because it would create a strict ordering dependency where the uffd methods would need to be called beyond the last potentially failing branch in order to be safe. This solution as opposed only adds the dependency to common code to set mm->owner to NULL and to free the task struct that was pointed by mm->owner with RCU, if fork ends up failing. The userfaultfd methods can still be called anywhere during the fork runtime and the monitor will keep discarding orphaned "mm" coming from failed forks in userland. This race condition couldn't trigger if CONFIG_MEMCG was set =n at build time. [aarcange@redhat.com: improve changelog, reduce #ifdefs per Michal] Link: http://lkml.kernel.org/r/20190429035752.4508-1-aarcange@redhat.com Link: http://lkml.kernel.org/r/20190325225636.11635-2-aarcange@redhat.com Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event") Signed-off-by: Andrea Arcangeli Tested-by: zhong jiang Reported-by: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Oleg Nesterov Cc: Jann Horn Cc: Hugh Dickins Cc: Mike Rapoport Cc: Mike Kravetz Cc: Peter Xu Cc: Jason Gunthorpe Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: zhong jiang Cc: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..b409e792aadc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1343,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1726,6 +1736,21 @@ static int pidfd_create(struct pid *pid) return fd; } +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2233,8 +2258,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2265,7 +2292,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); -- cgit From 987717e5e016a0dd3011d3bb16546672713f94e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:50 -0700 Subject: mm: change mm_update_next_owner() to update mm->owner with WRITE_ONCE The RCU reader uses rcu_dereference() inside rcu_read_lock critical sections, so the writer shall use WRITE_ONCE. Just a cleanup, we still rely on gcc to emit atomic writes in other places. Link: http://lkml.kernel.org/r/20190325225636.11635-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Jann Horn Cc: Jason Gunthorpe Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Oleg Nesterov Cc: Peter Xu Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } -- cgit From 136ac591f047fc356072343eb85687f7c6ea191d Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 14 May 2019 15:40:53 -0700 Subject: mm: update references to page _refcount Commit 0139aa7b7fa ("mm: rename _count, field of the struct page, to _refcount") left out a couple of references to the old field name. Fix that. Link: http://lkml.kernel.org/r/cedf87b02eb8a6b3eac57e8e91da53fb15c3c44c.1556537475.git.baruch@tkos.co.il Fixes: 0139aa7b7fa ("mm: rename _count, field of the struct page, to _refcount") Signed-off-by: Baruch Siach Reviewed-by: Andrew Morton Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index eee9c221280c..8345bb6e4769 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,7 +67,7 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx", + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f2f3fb4921d1..4b2d5f50431d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1986,7 +1986,7 @@ static void check_new_page_bad(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; bad_flags = __PG_HWPOISON; -- cgit From 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:56 -0700 Subject: psi: introduce state_mask to represent stalled psi states Patch series "psi: pressure stall monitors", v6. This is a respin of: https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/ Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high-enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x less false positives compared to vmpressure signals. Having ability to specify multiple triggers for the same psi metric allows other parts of Android framework to monitor memory state of the device and act accordingly. The new interface is straight-forward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.: /* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000" fd = open("/proc/pressure/memory") write(fd, trigger, sizeof(trigger)) while (poll() >= 0) { ... }; close(fd); When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-6 prepare the psi code for polling support. Patch 7 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files. The patches were developed in collaboration with Johannes Weiner. This patch (of 7): The psi monitoring patches will need to determine the same states as record_times(). To avoid calculating them twice, maintain a state mask that can be consulted cheaply. Do this in a separate patch to keep the churn in the main feature patch at a minimum. This adds 4-byte state_mask member into psi_group_cpu struct which results in its first cacheline-aligned part becoming 52 bytes long. Add explicit values to enumeration element counters that affect psi_group_cpu struct size. Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi_types.h | 9 ++++++--- kernel/sched/psi.c | 29 +++++++++++++++++++---------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 2cf422db5d18..762c6bb16f3c 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -11,7 +11,7 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - NR_PSI_TASK_COUNTS, + NR_PSI_TASK_COUNTS = 3, }; /* Task state bitmasks */ @@ -24,7 +24,7 @@ enum psi_res { PSI_IO, PSI_MEM, PSI_CPU, - NR_PSI_RESOURCES, + NR_PSI_RESOURCES = 3, }; /* @@ -41,7 +41,7 @@ enum psi_states { PSI_CPU_SOME, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES, + NR_PSI_STATES = 6, }; struct psi_group_cpu { @@ -53,6 +53,9 @@ struct psi_group_cpu { /* States of the tasks belonging to this group */ unsigned int tasks[NR_PSI_TASK_COUNTS]; + /* Aggregate pressure state derived from the tasks */ + u32 state_mask; + /* Period time sampling buckets for each state of interest (ns) */ u32 times[NR_PSI_STATES]; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..22c1505ad290 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -213,17 +213,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) static void get_recent_times(struct psi_group *group, int cpu, u32 *times) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,7 +239,7 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; delta = times[s] - groupc->times_prev[s]; @@ -407,15 +407,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,10 +436,10 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } @@ -448,6 +448,8 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,6 +482,13 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); } -- cgit From 9289c5e6a78a5a9397df5fa60eb82b105abcfecf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:59 -0700 Subject: psi: make psi_enable static psi_enable is not used outside of psi.c, make it static. Link: http://lkml.kernel.org/r/20190319235619.260832-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 22c1505ad290..281702de9772 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,9 +140,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { -- cgit From bcc78db64168eb6dede056fed2999f75f7ace309 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:02 -0700 Subject: psi: rename psi fields in preparation for psi trigger addition Rename psi_group structure member fields used for calculating psi totals and averages for clear distinction between them and for trigger-related fields that will be added by "psi: introduce psi monitor". [surenb@google.com: v6] Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/psi_types.h | 14 +++++++------- kernel/sched/psi.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 762c6bb16f3c..4d1c1f67be18 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -69,17 +69,17 @@ struct psi_group_cpu { }; struct psi_group { - /* Protects data updated during an aggregation */ - struct mutex stat_lock; + /* Protects data used by the aggregator */ + struct mutex avgs_lock; /* Per-cpu task state & time tracking */ struct psi_group_cpu __percpu *pcpu; - /* Periodic aggregation state */ - u64 total_prev[NR_PSI_STATES - 1]; - u64 last_update; - u64 next_update; - struct delayed_work clock_work; + /* Running pressure averages */ + u64 avg_total[NR_PSI_STATES - 1]; + u64 avg_last_update; + u64 avg_next_update; + struct delayed_work avgs_work; /* Total stall times and sampled pressure averages */ u64 total[NR_PSI_STATES - 1]; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 281702de9772..4fb4d9913bc8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -165,7 +165,7 @@ static struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +173,9 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); } void __init psi_init(void) @@ -278,7 +278,7 @@ static bool update_stats(struct psi_group *group) int cpu; int s; - mutex_lock(&group->stat_lock); + mutex_lock(&group->avgs_lock); /* * Collect the per-cpu time buckets and average them into a @@ -319,7 +319,7 @@ static bool update_stats(struct psi_group *group) /* avgX= */ now = sched_clock(); - expires = group->next_update; + expires = group->avg_next_update; if (now < expires) goto out; if (now - expires >= psi_period) @@ -332,14 +332,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,22 +359,22 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } out: - mutex_unlock(&group->stat_lock); + mutex_unlock(&group->avgs_lock); return nonidle_total; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; bool nonidle; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); /* * If there is task activity, periodically fold the per-cpu @@ -391,8 +391,9 @@ static void psi_update_work(struct work_struct *work) u64 now; now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; + if (group->avg_next_update > now) + delay = nsecs_to_jiffies( + group->avg_next_update - now) + 1; schedule_delayed_work(dwork, delay); } } @@ -546,13 +547,13 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -649,7 +650,7 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); } -- cgit From 7fc70a3999366560ad1d4f2389a78360300c2c6a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:06 -0700 Subject: psi: split update_stats into parts Split update_stats into collect_percpu_times and update_averages for collect_percpu_times to be reused later inside psi monitor. Link: http://lkml.kernel.org/r/20190319235619.260832-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 57 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4fb4d9913bc8..ace5ed97b186 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -269,17 +269,13 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static bool collect_percpu_times(struct psi_group *group) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; int cpu; int s; - mutex_lock(&group->avgs_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -317,11 +313,18 @@ static bool update_stats(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + return nonidle_total; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + /* avgX= */ - now = sched_clock(); expires = group->avg_next_update; - if (now < expires) - goto out; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,7 +335,7 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + avg_next_update = expires + ((1 + missed_periods) * psi_period); period = now - (group->avg_last_update + (missed_periods * psi_period)); group->avg_last_update = now; @@ -362,9 +365,8 @@ static bool update_stats(struct psi_group *group) group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->avgs_lock); - return nonidle_total; + + return avg_next_update; } static void psi_avgs_work(struct work_struct *work) @@ -372,10 +374,16 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; bool nonidle; + u64 now; dwork = to_delayed_work(work); group = container_of(dwork, struct psi_group, avgs_work); + mutex_lock(&group->avgs_lock); + + now = sched_clock(); + + nonidle = collect_percpu_times(group); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,19 +391,15 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; - - now = sched_clock(); - if (group->avg_next_update > now) - delay = nsecs_to_jiffies( - group->avg_next_update - now) + 1; - schedule_delayed_work(dwork, delay); + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); } + + mutex_unlock(&group->avgs_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -707,11 +711,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; -- cgit From 333f3017c5a893b000b2b4a3529814ce93fa83d7 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:09 -0700 Subject: psi: track changed states Introduce changed_states parameter into collect_percpu_times to track the states changed since the last update. This will be needed to detect whether polled states activated in the monitor patch. Link: http://lkml.kernel.org/r/20190319235619.260832-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ace5ed97b186..1b99eeffaa25 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -210,7 +210,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); u64 now, state_start; @@ -218,6 +219,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) unsigned int seq; u32 state_mask; + *pchanged_states = 0; + /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); @@ -246,6 +249,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) groupc->times_prev[s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,10 +274,11 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool collect_percpu_times(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; + u32 changed_states = 0; int cpu; int s; @@ -287,8 +293,11 @@ static bool collect_percpu_times(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -313,7 +322,8 @@ static bool collect_percpu_times(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); - return nonidle_total; + if (pchanged_states) + *pchanged_states = changed_states; } static u64 update_averages(struct psi_group *group, u64 now) @@ -373,6 +383,7 @@ static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; u64 now; @@ -383,7 +394,8 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - nonidle = collect_percpu_times(group); + collect_percpu_times(group, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -719,7 +731,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group); + collect_percpu_times(group, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); -- cgit From 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:12 -0700 Subject: include/: refactor headers to allow kthread.h inclusion in psi_types.h kthread.h can't be included in psi_types.h because it creates a circular inclusion with kthread.h eventually including psi_types.h and complaining on kthread structures not being defined because they are defined further in the kthread.h. Resolve this by removing psi_types.h inclusion from the headers included from kthread.h. Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/spi/spi-rockchip.c | 1 + include/linux/kthread.h | 3 ++- include/linux/sched.h | 1 - kernel/kthread.c | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 3912526ead66..cdb613d38062 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2c89e60bc752..0f9da966934e 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,7 +4,6 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include #include -#include __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,6 +197,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +struct cgroup_subsys_state; + #ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index a2cd15855bad..11837410690f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit From 0e94682b73bfa6c44c98af7a26771c9c08c055d5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:15 -0700 Subject: psi: introduce psi monitor Psi monitor aims to provide a low-latency short-term pressure detection mechanism configurable by users. It allows users to monitor psi metrics growth and trigger events whenever a metric raises above user-defined threshold within user-defined time window. Time window and threshold are both expressed in usecs. Multiple psi resources with different thresholds and window sizes can be monitored concurrently. Psi monitors activate when system enters stall state for the monitored psi metric and deactivate upon exit from the stall state. While system is in the stall state psi signal growth is monitored at a rate of 10 times per tracking window. Min window size is 500ms, therefore the min monitoring interval is 50ms. Max window size is 10s with monitoring interval of 1s. When activated psi monitor stays active for at least the duration of one tracking window to avoid repeated activations/deactivations when psi signal is bouncing. Notifications to the users are rate-limited to one per tracking window. Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/psi.txt | 107 +++++++++ include/linux/psi.h | 8 + include/linux/psi_types.h | 82 ++++++- kernel/cgroup/cgroup.c | 71 +++++- kernel/sched/psi.c | 494 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 742 insertions(+), 20 deletions(-) diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt index 7e71c9c1d8e9..5cbe5659e3b7 100644 --- a/Documentation/accounting/psi.txt +++ b/Documentation/accounting/psi.txt @@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time spikes which wouldn't necessarily make a dent in the time averages, or to average trends over custom time frames. +Monitoring for pressure thresholds +================================== + +Users can register triggers and use poll() to be woken up when resource +pressure exceeds certain thresholds. + +A trigger describes the maximum cumulative stall time over a specific +time window, e.g. 100ms of total stall time within any 500ms window to +generate a wakeup event. + +To register a trigger user has to open psi interface file under +/proc/pressure/ representing the resource to be monitored and write the +desired threshold and time window. The open file descriptor should be +used to wait for trigger events using select(), poll() or epoll(). +The following format is used: + +