From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file -- cgit From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:20 -0700 Subject: exit_thread: accept a task parameter to be exited We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..75b34fe835b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent -- cgit From 0740aa5f6375681c57488c4ea55d05a0341cfc9c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:25 -0700 Subject: fork: free thread in copy_process on failure When using this program (as root): #include #include #include #include #include #include #include #define ITER 1000 #define FORKERS 15 #define THREADS (6000/FORKERS) // 1850 is proc max static void fork_100_wait() { unsigned a, to_wait = 0; printf("\t%d forking %d\n", THREADS, getpid()); for (a = 0; a < THREADS; a++) { switch (fork()) { case 0: usleep(1000); exit(0); break; case -1: break; default: to_wait++; break; } } printf("\t%d forked from %d, waiting for %d\n", THREADS, getpid(), to_wait); for (a = 0; a < to_wait; a++) wait(NULL); printf("\t%d waited from %d\n", THREADS, getpid()); } static void run_forkers() { pid_t forkers[FORKERS]; unsigned a; for (a = 0; a < FORKERS; a++) { switch ((forkers[a] = fork())) { case 0: fork_100_wait(); exit(0); break; case -1: err(1, "DIE fork of %d'th forker", a); break; default: break; } } for (a = 0; a < FORKERS; a++) waitpid(forkers[a], NULL, 0); } int main() { unsigned a; int ret; ret = ioperm(10, 20, 0); if (ret < 0) err(1, "ioperm"); for (a = 0; a < ITER; a++) run_forkers(); return 0; } kmemleak reports many occurences of this leak: unreferenced object 0xffff8805917c8000 (size 8192): comm "fork-leak", pid 2932, jiffies 4295354292 (age 1871.028s) hex dump (first 32 bytes): ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ................ ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ................ backtrace: [] kmemdup+0x25/0x50 [] copy_thread_tls+0x6c3/0x9a0 [] copy_process+0x1a84/0x5790 [] wake_up_new_task+0x2d5/0x6f0 [] _do_fork+0x12d/0x820 ... Due to the leakage of the memory items which should have been freed in arch/x86/kernel/process.c:exit_thread(). Make sure the memory is freed when fork fails later in copy_process. This is done by calling exit_thread with the thread to kill. Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8fbed7194af1..103d78fd8f75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1490,7 +1490,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { retval = PTR_ERR(pid); - goto bad_fork_cleanup_io; + goto bad_fork_cleanup_thread; } } @@ -1652,6 +1652,8 @@ bad_fork_cancel_cgroup: bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_thread: + exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); -- cgit From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:33 -0700 Subject: printk/nmi: generic solution for safe printk in NMI printk() takes some locks and could not be used a safe way in NMI context. The chance of a deadlock is real especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited. We still should keep the number of messages in NMI context at minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reseted when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and reset the buffer. Then we could use a simple cmpxchg operations to get synchronized with writers. There is also used a spinlock to get synchronized with other flushers. We do not longer use seq_buf because it depends on external lock. It would be hard to make all supported operations safe for a lockless use. It would be confusing and error prone to make only some operations safe. The code is put into separate printk/nmi.c as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The are MN10300 and Xtensa architectures. We need to clean up NMI handling there first. Let's do it separately. The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek Suggested-by: Peter Zijlstra Suggested-by: Steven Rostedt Cc: Jan Kara Acked-by: Russell King [arm part] Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/Makefile | 1 + kernel/printk/internal.h | 44 ++++++++++ kernel/printk/nmi.c | 219 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 19 +--- 4 files changed, 265 insertions(+), 18 deletions(-) create mode 100644 kernel/printk/internal.h create mode 100644 kernel/printk/nmi.c (limited to 'kernel') diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..2de99faedfc1 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,44 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#include + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. + */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..303cf0d15e57 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,219 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; + +#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) + return 0; + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..71eba0607034 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; -- cgit From b522deabc6f18e4f938d93a84f345f2cbf3347d1 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:36 -0700 Subject: printk/nmi: warn when some message has been lost in NMI context We could not resize the temporary buffer in NMI context. Let's warn if a message is lost. This is rather theoretical. printk() should not be used in NMI. The only sensible use is when we want to print backtrace from all CPUs. The current buffer should be enough for this purpose. [akpm@linux-foundation.org: whitespace fixlet] Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Russell King Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/internal.h | 11 +++++++++++ kernel/printk/nmi.c | 5 ++++- kernel/printk/printk.c | 10 ++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2de99faedfc1..341bedccc065 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -34,6 +34,12 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) return this_cpu_read(printk_func)(fmt, args); } +extern atomic_t nmi_message_lost; +static inline int get_nmi_message_lost(void) +{ + return atomic_xchg(&nmi_message_lost, 0); +} + #else /* CONFIG_PRINTK_NMI */ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) @@ -41,4 +47,9 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) return vprintk_default(fmt, args); } +static inline int get_nmi_message_lost(void) +{ + return 0; +} + #endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 303cf0d15e57..572f94922230 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -39,6 +39,7 @@ */ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_nmi_irq_ready; +atomic_t nmi_message_lost; #define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) @@ -64,8 +65,10 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) + if (len >= sizeof(s->buffer)) { + atomic_inc(&nmi_message_lost); return 0; + } /* * Make sure that all old data have been read before the buffer was diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 71eba0607034..e38579d730f4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1617,6 +1617,7 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + int nmi_message_lost; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static unsigned int logbuf_cpu = UINT_MAX; @@ -1667,6 +1668,15 @@ asmlinkage int vprintk_emit(int facility, int level, strlen(recursion_msg)); } + nmi_message_lost = get_nmi_message_lost(); + if (unlikely(nmi_message_lost)) { + text_len = scnprintf(textbuf, sizeof(textbuf), + "BAD LUCK: lost %d message(s) from NMI context!", + nmi_message_lost); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, textbuf, text_len); + } + /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. -- cgit From 427934b8714ec130b068d1c9d88f25b24aaede32 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:39 -0700 Subject: printk/nmi: increase the size of NMI buffer and make it configurable Testing has shown that the backtrace sometimes does not fit into the 4kB temporary buffer that is used in NMI context. The warnings are gone when I double the temporary buffer size. This patch doubles the buffer size and makes it configurable. Note that this problem existed even in the x86-specific implementation that was added by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). Nobody noticed it because it did not print any warnings. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Russell King Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 572f94922230..bf08557d7e3d 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -41,7 +41,8 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_nmi_irq_ready; atomic_t nmi_message_lost; -#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) +#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - sizeof(struct irq_work)) struct nmi_seq_buf { atomic_t len; /* length of written data */ -- cgit From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:42 -0700 Subject: printk/nmi: flush NMI messages on the system panic In NMI context, printk() messages are stored into per-CPU buffers to avoid a possible deadlock. They are normally flushed to the main ring buffer via an IRQ work. But the work is never called when the system calls panic() in the very same NMI handler. This patch tries to flush NMI buffers before the crash dump is generated. In this case it does not risk a double release and bails out when the logbuf_lock is already taken. The aim is to get the messages into the main ring buffer when possible. It makes them better accessible in the vmcore. Then the patch tries to flush the buffers second time when other CPUs are down. It might be more aggressive and reset logbuf_lock. The aim is to get the messages available for the consequent kmsg_dump() and console_flush_on_panic() calls. The patch causes vprintk_emit() to be called even in NMI context again. But it is done via printk_deferred() so that the console handling is skipped. Consoles use internal locks and we could not prevent a deadlock easily. They are explicitly called later when the crash dump is not generated, see console_flush_on_panic(). Signed-off-by: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Daniel Thompson Cc: David Miller Cc: Ingo Molnar Cc: Jan Kara Cc: Jiri Kosina Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 1 + kernel/panic.c | 6 +++++- kernel/printk/internal.h | 2 ++ kernel/printk/nmi.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/printk/printk.c | 2 +- 5 files changed, 47 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1c03dfb4abfd..d5d408252992 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -160,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 341bedccc065..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI +extern raw_spinlock_t logbuf_lock; + /* * printk() could not take logbuf_lock in NMI context. Instead, * it temporary stores the strings into a per-CPU buffer. diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bf08557d7e3d..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) { const char *buf = s->buffer + start; - printk("%.*s", (end - start) + 1, buf); + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. + */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + } /* @@ -194,6 +204,33 @@ void printk_nmi_flush(void) __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); } +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + void __init printk_nmi_init(void) { int cpu; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e38579d730f4..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); -- cgit From ede9c27749b9b35efdffa4f63a39f819d7913752 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:10 -0700 Subject: kernel/sysctl_binary.c: use generic UUID library UUID library provides uuid_be type and uuid_be_to_bin() function. This substitutes open coded variant by generic library calls. Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 10a1d7dc9313..6eb99c17dbd8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { - char buf[40], *str = buf; - unsigned char uuid[16]; - int i; + char buf[UUID_STRING_LEN + 1]; + uuid_be uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; - /* Convert the uuid to from a string to binary */ - for (i = 0; i < 16; i++) { - result = -EIO; - if (!isxdigit(str[0]) || !isxdigit(str[1])) - goto out; - - uuid[i] = (hex_to_bin(str[0]) << 4) | - hex_to_bin(str[1]); - str += 2; - if (*str == '-') - str++; - } + result = -EIO; + if (uuid_be_to_bin(buf, &uuid)) + goto out; if (oldlen > 16) oldlen = 16; result = -EFAULT; - if (copy_to_user(oldval, uuid, oldlen)) + if (copy_to_user(oldval, &uuid, oldlen)) goto out; copied = oldlen; -- cgit From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:33 -0700 Subject: radix-tree: introduce radix_tree_empty Commit e61452365372 ("radix_tree: add support for multi-order entries") left the impression that the support for multiorder radix tree entries was functional. As soon as Ross tried to use it, it became apparent that my testing was completely inadequate, and it didn't even work a little bit for orders that were not a multiple of shift. This series of patches is the result of about 6 weeks of redesign, reimplementation, testing, arguing and hair-pulling. The great news is that the test-suite is now far better than it was. That's reflected in the diffstat for the test-suite alone: 12 files changed, 436 insertions(+), 28 deletions(-) The highlight for users of the tree is that the restriction on the order of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix tree now supports any order between 0 and 64. For those who are interested in how the tree works, patch 9 is probably the most interesting one as it introduces the new machinery for handling sibling entries. I've tried to be fair in attributing authorship to the person who contributed the majority of the code in each patch; Ross has been an invaluable partner in the development of this support and it's fair to say that each of us has code in every commit. I should also express my appreciation of the 0day testing. It prompted me that I was bloating the tinyconfig in an unacceptable way, and it bisected to a commit which contained a rather nasty memory-corruption bug. This patch (of 29): The irqdomain code was checking for 0 or 1 entries, not 0 entries like the comment said they were. Introduce a new helper that will actually check for an empty tree. Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/irqdomain.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d65f6f31a5b3..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); -- cgit