Diffstat (limited to 'kernel/printk')
-rw-r--r--  kernel/printk/Makefile              |    7
-rw-r--r--  kernel/printk/console_cmdline.h     |    1
-rw-r--r--  kernel/printk/index.c               |  194
-rw-r--r--  kernel/printk/internal.h            |  324
-rw-r--r--  kernel/printk/nbcon.c               | 1816
-rw-r--r--  kernel/printk/printk.c              | 4104
-rw-r--r--  kernel/printk/printk_ringbuffer.c   | 2353
-rw-r--r--  kernel/printk/printk_ringbuffer.h   |  437
-rw-r--r--  kernel/printk/printk_safe.c         |  400
-rw-r--r--  kernel/printk/sysctl.c              |   84
10 files changed, 8073 insertions, 1647 deletions
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..39a2b61c7232 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,4 +1,9 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
+obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
+obj-$(CONFIG_PRINTK_INDEX) += index.o
+
+obj-$(CONFIG_PRINTK) += printk_support.o
+printk_support-y := printk_ringbuffer.o
+printk_support-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index 3ca74ad391d6..0ab573b6d4dc 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -6,6 +6,7 @@ struct console_cmdline
{
char name[16]; /* Name of the driver */
int index; /* Minor dev. to use */
+ char devname[32]; /* DEVNAME:0.0 style device name */
bool user_specified; /* Specified by command line vs. platform */
char *options; /* Options for the driver */
#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
diff --git a/kernel/printk/index.c b/kernel/printk/index.c
new file mode 100644
index 000000000000..a6b27526baaf
--- /dev/null
+++ b/kernel/printk/index.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace indexing of printk formats
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string_helpers.h>
+
+#include "internal.h"
+
+extern struct pi_entry *__start_printk_index[];
+extern struct pi_entry *__stop_printk_index[];
+
+/* The base dir for module formats, typically debugfs/printk/index/ */
+static struct dentry *dfs_index;
+
+static struct pi_entry *pi_get_entry(const struct module *mod, loff_t pos)
+{
+ struct pi_entry **entries;
+ unsigned int nr_entries;
+
+#ifdef CONFIG_MODULES
+ if (mod) {
+ entries = mod->printk_index_start;
+ nr_entries = mod->printk_index_size;
+ } else
+#endif
+ {
+ /* vmlinux, comes from linker symbols */
+ entries = __start_printk_index;
+ nr_entries = __stop_printk_index - __start_printk_index;
+ }
+
+ if (pos >= nr_entries)
+ return NULL;
+
+ return entries[pos];
+}
+
+static void *pi_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ const struct module *mod = s->file->f_inode->i_private;
+ struct pi_entry *entry = pi_get_entry(mod, *pos);
+
+ (*pos)++;
+
+ return entry;
+}
+
+static void *pi_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Make show() print the header line. Do not update *pos because
+ * pi_next() still has to return the entry at index 0 later.
+ */
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ return pi_next(s, NULL, pos);
+}
+
+/*
+ * We need both ESCAPE_ANY and explicit characters from ESCAPE_SPECIAL in @only
+ * because otherwise ESCAPE_NAP will cause double quotes and backslashes to be
+ * ignored for quoting.
+ */
+#define seq_escape_printf_format(s, src) \
+ seq_escape_str(s, src, ESCAPE_ANY | ESCAPE_NAP | ESCAPE_APPEND, "\"\\")
+
+static int pi_show(struct seq_file *s, void *v)
+{
+ const struct pi_entry *entry = v;
+ int level = LOGLEVEL_DEFAULT;
+ enum printk_info_flags flags = 0;
+ u16 prefix_len = 0;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "# <level/flags> filename:line function \"format\"\n");
+ return 0;
+ }
+
+ if (!entry->fmt)
+ return 0;
+
+ if (entry->level)
+ printk_parse_prefix(entry->level, &level, &flags);
+ else
+ prefix_len = printk_parse_prefix(entry->fmt, &level, &flags);
+
+
+ if (flags & LOG_CONT) {
+ /*
+ * LOGLEVEL_DEFAULT here means "use the same level as the
+ * message we're continuing from", not the default message
+ * loglevel, so don't display it as such.
+ */
+ if (level == LOGLEVEL_DEFAULT)
+ seq_puts(s, "<c>");
+ else
+ seq_printf(s, "<%d,c>", level);
+ } else
+ seq_printf(s, "<%d>", level);
+
+ seq_printf(s, " %s:%d %s \"", entry->file, entry->line, entry->func);
+ if (entry->subsys_fmt_prefix)
+ seq_escape_printf_format(s, entry->subsys_fmt_prefix);
+ seq_escape_printf_format(s, entry->fmt + prefix_len);
+ seq_puts(s, "\"\n");
+
+ return 0;
+}
+
+static void pi_stop(struct seq_file *p, void *v) { }
+
+static const struct seq_operations dfs_index_sops = {
+ .start = pi_start,
+ .next = pi_next,
+ .show = pi_show,
+ .stop = pi_stop,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dfs_index);
+
+#ifdef CONFIG_MODULES
+static const char *pi_get_module_name(struct module *mod)
+{
+ return mod ? mod->name : "vmlinux";
+}
+#else
+static const char *pi_get_module_name(struct module *mod)
+{
+ return "vmlinux";
+}
+#endif
+
+static void pi_create_file(struct module *mod)
+{
+ debugfs_create_file(pi_get_module_name(mod), 0444, dfs_index,
+ mod, &dfs_index_fops);
+}
+
+#ifdef CONFIG_MODULES
+static void pi_remove_file(struct module *mod)
+{
+ debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);
+}
+
+static int pi_module_notify(struct notifier_block *nb, unsigned long op,
+ void *data)
+{
+ struct module *mod = data;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ pi_create_file(mod);
+ break;
+ case MODULE_STATE_GOING:
+ pi_remove_file(mod);
+ break;
+ default: /* we don't care about other module states */
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block module_printk_fmts_nb = {
+ .notifier_call = pi_module_notify,
+};
+
+static void __init pi_setup_module_notifier(void)
+{
+ register_module_notifier(&module_printk_fmts_nb);
+}
+#else
+static inline void __init pi_setup_module_notifier(void) { }
+#endif
+
+static int __init pi_init(void)
+{
+ struct dentry *dfs_root = debugfs_create_dir("printk", NULL);
+
+ dfs_index = debugfs_create_dir("index", dfs_root);
+ pi_setup_module_notifier();
+ pi_create_file(NULL);
+
+ return 0;
+}
+
+/* debugfs comes up on core and must be initialised first */
+postcore_initcall(pi_init);
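The index file created above is a plain seq_file, so the format documented in pi_show() can be consumed with ordinary stdio. A minimal sketch, assuming debugfs is mounted at /sys/kernel/debug:

#include <stdio.h>

/*
 * Minimal sketch: dump the printk format index for vmlinux. Each line has
 * the form reported by pi_show():  <level> file:line function "format"
 */
int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/printk/index/vmlinux", "r");
	char line[1024];

	if (!f) {
		perror("printk index");
		return 1;
	}

	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}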
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 660f9a6bf73a..a91bdf802967 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -2,30 +2,80 @@
/*
* internal.h - printk internal definitions
*/
+#include <linux/console.h>
#include <linux/percpu.h>
+#include <linux/types.h>
+
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+struct ctl_table;
+void __init printk_sysctl_init(void);
+int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#else
+#define printk_sysctl_init() do { } while (0)
+#endif
+
+#define con_printk(lvl, con, fmt, ...) \
+ printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt), \
+ (con->flags & CON_NBCON) ? "" : "legacy ", \
+ (con->flags & CON_BOOT) ? "boot" : "", \
+ con->name, con->index, ##__VA_ARGS__)
+
+/*
+ * Identify if legacy printing is forced in a dedicated kthread. If
+ * true, all printing via console lock occurs within a dedicated
+ * legacy printer thread. The only exception is on panic, after the
+ * nbcon consoles have had their chance to print the panic messages
+ * first.
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define force_legacy_kthread() (true)
+#else
+# define force_legacy_kthread() (false)
+#endif
#ifdef CONFIG_PRINTK
-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
+#ifdef CONFIG_PRINTK_CALLER
+#define PRINTK_PREFIX_MAX 48
+#else
+#define PRINTK_PREFIX_MAX 32
+#endif
-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
+/*
+ * the maximum size of a formatted record (i.e. with prefix added
+ * per line and dropped messages or in extended message format)
+ */
+#define PRINTK_MESSAGE_MAX 2048
+
+/* the maximum size allowed to be reserved for a record */
+#define PRINTKRB_RECORD_MAX 1024
+
+/* Flags for a single printk record. */
+enum printk_info_flags {
+ /* always show on console, ignore console_loglevel */
+ LOG_FORCE_CON = 1,
+ LOG_NEWLINE = 2, /* text ended with a newline */
+ LOG_CONT = 8, /* text is a fragment of a continuation line */
+};
-extern raw_spinlock_t logbuf_lock;
+struct printk_ringbuffer;
+struct dev_printk_info;
-__printf(5, 0)
+extern struct printk_ringbuffer *prb;
+extern bool printk_kthreads_running;
+
+__printf(4, 0)
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args);
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
+
void __printk_safe_enter(void);
void __printk_safe_exit(void);
-void printk_safe_init(void);
bool printk_percpu_data_ready(void);
#define printk_safe_enter_irqsave(flags) \
@@ -40,35 +90,257 @@ bool printk_percpu_data_ready(void);
local_irq_restore(flags); \
} while (0)
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
+void defer_console_output(void);
+bool is_printk_legacy_deferred(void);
+bool is_printk_force_console(void);
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags);
+void console_lock_spinning_enable(void);
+int console_lock_spinning_disable_and_check(int cookie);
-void defer_console_output(void);
+u64 nbcon_seq_read(struct console *con);
+void nbcon_seq_force(struct console *con, u64 seq);
+bool nbcon_alloc(struct console *con);
+void nbcon_free(struct console *con);
+enum nbcon_prio nbcon_get_default_prio(void);
+void nbcon_atomic_flush_pending(void);
+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic);
+bool nbcon_kthread_create(struct console *con);
+void nbcon_kthread_stop(struct console *con);
+void nbcon_kthreads_wake(void);
+
+/*
+ * Check if the given console is currently capable and allowed to print
+ * records. Note that this function does not consider the current context,
+ * which can also play a role in deciding if @con can be used to print
+ * records.
+ */
+static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
+{
+ if (!(flags & CON_ENABLED))
+ return false;
+
+ if ((flags & CON_SUSPENDED))
+ return false;
+
+ if (flags & CON_NBCON) {
+ /* The write_atomic() callback is optional. */
+ if (use_atomic && !con->write_atomic)
+ return false;
+
+ /*
+ * For the !use_atomic case, @printk_kthreads_running is not
+ * checked because the write_thread() callback is also used
+ * via the legacy loop when the printer threads are not
+ * available.
+ */
+ } else {
+ if (!con->write)
+ return false;
+ }
+
+ /*
+ * Console drivers may assume that per-cpu resources have been
+ * allocated. So unless they're explicitly marked as being able to
+ * cope (CON_ANYTIME) don't call them until this CPU is officially up.
+ */
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
+ return false;
+
+ return true;
+}
+
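A usage note on console_is_usable(): callers are expected to hold the console-list SRCU read lock and pass in a flags snapshot they already read, rather than letting the helper re-read them. A condensed sketch of that pattern, kernel context assumed (the same shape appears in nbcon_kthread_should_wakeup() in nbcon.c below):

/* Sketch only: walk the registered consoles and test usability under SRCU. */
static void example_walk_usable_consoles(bool use_atomic)
{
	struct console *con;
	short flags;
	int cookie;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		flags = console_srcu_read_flags(con);
		if (!console_is_usable(con, flags, use_atomic))
			continue;
		/* ... emit pending records for @con here ... */
	}
	console_srcu_read_unlock(cookie);
}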
+/**
+ * nbcon_kthread_wake - Wake up a console printing thread
+ * @con: Console to operate on
+ */
+static inline void nbcon_kthread_wake(struct console *con)
+{
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the rcuwait is empty.
+ *
+ * The full memory barrier in rcuwait_wake_up() pairs with the full
+ * memory barrier within set_current_state() of
+ * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait()
+ * adds the waiter but before it has checked the wait condition.
+ *
+ * This pairs with nbcon_kthread_func:A.
+ */
+ rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */
+}
#else
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
+#define PRINTK_PREFIX_MAX 0
+#define PRINTK_MESSAGE_MAX 0
+#define PRINTKRB_RECORD_MAX 0
+
+#define printk_kthreads_running (false)
/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-static inline void printk_safe_init(void) { }
static inline bool printk_percpu_data_ready(void) { return false; }
+static inline void defer_console_output(void) { }
+static inline bool is_printk_legacy_deferred(void) { return false; }
+static inline u64 nbcon_seq_read(struct console *con) { return 0; }
+static inline void nbcon_seq_force(struct console *con, u64 seq) { }
+static inline bool nbcon_alloc(struct console *con) { return false; }
+static inline void nbcon_free(struct console *con) { }
+static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; }
+static inline void nbcon_atomic_flush_pending(void) { }
+static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic) { return false; }
+static inline void nbcon_kthread_wake(struct console *con) { }
+static inline void nbcon_kthreads_wake(void) { }
+
+static inline bool console_is_usable(struct console *con, short flags,
+ bool use_atomic) { return false; }
+
#endif /* CONFIG_PRINTK */
+
+extern bool have_boot_console;
+extern bool have_nbcon_console;
+extern bool have_legacy_console;
+extern bool legacy_allow_panic_sync;
+
+/**
+ * struct console_flush_type - Define available console flush methods
+ * @nbcon_atomic: Flush directly using nbcon_atomic() callback
+ * @nbcon_offload: Offload flush to printer thread
+ * @legacy_direct: Call the legacy loop in this context
+ * @legacy_offload: Offload the legacy loop into IRQ or legacy thread
+ *
+ * Note that the legacy loop also flushes the nbcon consoles.
+ */
+struct console_flush_type {
+ bool nbcon_atomic;
+ bool nbcon_offload;
+ bool legacy_direct;
+ bool legacy_offload;
+};
+
+/*
+ * Identify which console flushing methods should be used in the context of
+ * the caller.
+ */
+static inline void printk_get_console_flush_type(struct console_flush_type *ft)
+{
+ memset(ft, 0, sizeof(*ft));
+
+ switch (nbcon_get_default_prio()) {
+ case NBCON_PRIO_NORMAL:
+ if (have_nbcon_console && !have_boot_console) {
+ if (printk_kthreads_running)
+ ft->nbcon_offload = true;
+ else
+ ft->nbcon_atomic = true;
+ }
+
+ /* Legacy consoles are flushed directly when possible. */
+ if (have_legacy_console || have_boot_console) {
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+ else
+ ft->legacy_offload = true;
+ }
+ break;
+
+ case NBCON_PRIO_EMERGENCY:
+ if (have_nbcon_console && !have_boot_console)
+ ft->nbcon_atomic = true;
+
+ /* Legacy consoles are flushed directly when possible. */
+ if (have_legacy_console || have_boot_console) {
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+ else
+ ft->legacy_offload = true;
+ }
+ break;
+
+ case NBCON_PRIO_PANIC:
+ /*
+ * In panic, the nbcon consoles will directly print. But
+ * only allowed if there are no boot consoles.
+ */
+ if (have_nbcon_console && !have_boot_console)
+ ft->nbcon_atomic = true;
+
+ if (have_legacy_console || have_boot_console) {
+ /*
+ * This is the same decision as NBCON_PRIO_NORMAL
+ * except that offloading never occurs in panic.
+ *
+ * Note that console_flush_on_panic() will flush
+ * legacy consoles anyway, even if unsafe.
+ */
+ if (!is_printk_legacy_deferred())
+ ft->legacy_direct = true;
+
+ /*
+ * In panic, if nbcon atomic printing occurs,
+ * the legacy consoles must remain silent until
+ * explicitly allowed.
+ */
+ if (ft->nbcon_atomic && !legacy_allow_panic_sync)
+ ft->legacy_direct = false;
+ }
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
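To illustrate the intended calling pattern (a simplified sketch, not the exact code in printk.c): a printing context takes one flush-type snapshot and then triggers each selected method.

/* Sketch: act on the flush methods selected for the current context. */
static void example_trigger_flush(void)
{
	struct console_flush_type ft;

	printk_get_console_flush_type(&ft);

	if (ft.nbcon_atomic)
		nbcon_atomic_flush_pending();

	if (ft.nbcon_offload)
		nbcon_kthreads_wake();

	if (ft.legacy_direct) {
		/* ... run the legacy console_lock()/console_unlock() loop ... */
	}

	if (ft.legacy_offload)
		defer_console_output();
}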
+extern struct printk_buffers printk_shared_pbufs;
+
+/**
+ * struct printk_buffers - Buffers to read/format/output printk messages.
+ * @outbuf: After formatting, contains text to output.
+ * @scratchbuf: Used as temporary ringbuffer reading and string-print space.
+ */
+struct printk_buffers {
+ char outbuf[PRINTK_MESSAGE_MAX];
+ char scratchbuf[PRINTKRB_RECORD_MAX];
+};
+
+/**
+ * struct printk_message - Container for a prepared printk message.
+ * @pbufs: printk buffers used to prepare the message.
+ * @outbuf_len: The length of prepared text in @pbufs->outbuf to output. This
+ * does not count the terminator. A value of 0 means there is
+ * nothing to output and this record should be skipped.
+ * @seq: The sequence number of the record used for @pbufs->outbuf.
+ * @dropped: The number of dropped records from reading @seq.
+ */
+struct printk_message {
+ struct printk_buffers *pbufs;
+ unsigned int outbuf_len;
+ u64 seq;
+ unsigned long dropped;
+};
+
+bool other_cpu_in_panic(void);
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress);
+
+#ifdef CONFIG_PRINTK
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
+void console_prepend_replay(struct printk_message *pmsg);
+#endif
+
+#ifdef CONFIG_SMP
+bool is_printk_cpu_sync_owner(void);
+#else
+static inline bool is_printk_cpu_sync_owner(void) { return false; }
+#endif
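The two structs above are meant to be used together: a reader supplies a struct printk_buffers, lets printk_get_next_message() format the next record into it, and then hands pbufs->outbuf to a console. A minimal sketch of that flow, assuming kernel context (nbcon_emit_next_record() in nbcon.c below does the same with extra ownership checks):

/* Sketch: format and emit the record at @seq on a legacy console. */
static bool example_emit_one_record(struct console *con, u64 seq)
{
	static struct printk_buffers pbufs;
	struct printk_message pmsg = {
		.pbufs = &pbufs,
	};

	if (!printk_get_next_message(&pmsg, seq, /* is_extended */ false,
				     /* may_suppress */ true))
		return false;			/* nothing new to print */

	if (pmsg.outbuf_len == 0)
		return true;			/* record suppressed, just skip it */

	con->write(con, pmsg.pbufs->outbuf, pmsg.outbuf_len);
	return true;
}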
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
new file mode 100644
index 000000000000..fd12efcc4aed
--- /dev/null
+++ b/kernel/printk/nbcon.c
@@ -0,0 +1,1816 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2022 Linutronix GmbH, John Ogness
+// Copyright (C) 2022 Intel, Thomas Gleixner
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kthread.h>
+#include <linux/minmax.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include "internal.h"
+#include "printk_ringbuffer.h"
+/*
+ * Printk console printing implementation for consoles that do not depend
+ * on the legacy-style console_lock mechanism.
+ *
+ * The state of the console is maintained in the "nbcon_state" atomic
+ * variable.
+ *
+ * The console is locked when:
+ *
+ * - The 'prio' field contains the priority of the context that owns the
+ * console. Only higher priority contexts are allowed to take over the
+ * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
+ *
+ * - The 'cpu' field denotes on which CPU the console is locked. It is used
+ * to prevent busy waiting on the same CPU. Also it informs the lock owner
+ * that it has lost the lock in a more complex scenario when the lock was
+ * taken over by a higher priority context, released, and taken on another
+ * CPU with the same priority as the interrupted owner.
+ *
+ * The acquire mechanism uses a few more fields:
+ *
+ * - The 'req_prio' field is used by the handover approach to make the
+ * current owner aware that there is a context with a higher priority
+ * waiting for the friendly handover.
+ *
+ * - The 'unsafe' field allows the console to be taken over safely in the
+ * middle of emitting a message. The field is set only when accessing some
+ * shared resources or when the console device is manipulated. It can be
+ * cleared, for example, after emitting one character when the console
+ * device is in a consistent state.
+ *
+ * - The 'unsafe_takeover' field is set when a hostile takeover took the
+ * console in an unsafe state. The console will stay in the unsafe state
+ * until re-initialized.
+ *
+ * The acquire mechanism uses three approaches:
+ *
+ * 1) Direct acquire when the console is not owned or is owned by a lower
+ * priority context and is in a safe state.
+ *
+ * 2) Friendly handover mechanism uses a request/grant handshake. It is used
+ * when the current owner has lower priority and the console is in an
+ * unsafe state.
+ *
+ * The requesting context:
+ *
+ * a) Sets its priority into the 'req_prio' field.
+ *
+ * b) Waits (with a timeout) for the owning context to unlock the
+ * console.
+ *
+ * c) Takes the lock and clears the 'req_prio' field.
+ *
+ * The owning context:
+ *
+ * a) Observes the 'req_prio' field set on exit from the unsafe
+ * console state.
+ *
+ * b) Gives up console ownership by clearing the 'prio' field.
+ *
+ * 3) Unsafe hostile takeover allows taking over the lock even when the
+ * console is in an unsafe state. It is used only in panic() by the final
+ * attempt to flush consoles in a try and hope mode.
+ *
+ * Note that separate record buffers are used in panic(). As a result,
+ * the messages can be read and formatted without any risk even after
+ * using the hostile takeover in unsafe state.
+ *
+ * The release function simply clears the 'prio' field.
+ *
+ * All operations on @console::nbcon_state are atomic cmpxchg based to
+ * handle concurrency.
+ *
+ * The acquire/release functions implement only minimal policies:
+ *
+ * - Preference for higher priority contexts.
+ * - Protection of the panic CPU.
+ *
+ * All other policy decisions must be made at the call sites:
+ *
+ * - What is marked as an unsafe section.
+ * - Whether to spin-wait if there is already an owner and the console is
+ * in an unsafe state.
+ * - Whether to attempt an unsafe hostile takeover.
+ *
+ * The design allows implementing the well-known pattern:
+ *
+ * acquire()
+ * output_one_printk_record()
+ * release()
+ *
+ * The output of one printk record might be interrupted with a higher priority
+ * context. The new owner is supposed to reprint the entire interrupted record
+ * from scratch.
+ */
+
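In code, that well-known pattern boils down to the following condensed sketch; the real implementation is nbcon_emit_one() later in this file, which additionally handles the device lock and the backlog flag:

/* Sketch: print one record under nbcon ownership at NORMAL priority. */
static void example_print_one_record(struct console *con)
{
	struct nbcon_write_context wctxt = {
		.ctxt.console	= con,
		.ctxt.prio	= NBCON_PRIO_NORMAL,
	};
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);

	if (!nbcon_context_try_acquire(ctxt))
		return;		/* someone else owns or will flush the console */

	if (!nbcon_emit_next_record(&wctxt, true))
		return;		/* ownership was lost, back out immediately */

	nbcon_context_release(ctxt);
}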
+/**
+ * nbcon_state_set - Helper function to set the console state
+ * @con: Console to update
+ * @new: The new state to write
+ *
+ * Only to be used when the console is not yet or no longer visible in the
+ * system. Otherwise use nbcon_state_try_cmpxchg().
+ */
+static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
+{
+ atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
+}
+
+/**
+ * nbcon_state_read - Helper function to read the console state
+ * @con: Console to read
+ * @state: The state to store the result
+ */
+static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
+{
+ state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
+}
+
+/**
+ * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
+ * @con: Console to update
+ * @cur: Old/expected state
+ * @new: New state
+ *
+ * Return: True on success. False on fail and @cur is updated.
+ */
+static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
+ struct nbcon_state *new)
+{
+ return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
+}
+
+/**
+ * nbcon_seq_read - Read the current console sequence
+ * @con: Console to read the sequence of
+ *
+ * Return: Sequence number of the next record to print on @con.
+ */
+u64 nbcon_seq_read(struct console *con)
+{
+ unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));
+
+ return __ulseq_to_u64seq(prb, nbcon_seq);
+}
+
+/**
+ * nbcon_seq_force - Force console sequence to a specific value
+ * @con: Console to work on
+ * @seq: Sequence number value to set
+ *
+ * Only to be used during init (before registration) or in extreme situations
+ * (such as panic with CONSOLE_REPLAY_ALL).
+ */
+void nbcon_seq_force(struct console *con, u64 seq)
+{
+ /*
+ * If the specified record no longer exists, the oldest available record
+ * is chosen. This is especially important on 32bit systems because only
+ * the lower 32 bits of the sequence number are stored. The upper 32 bits
+ * are derived from the sequence numbers available in the ringbuffer.
+ */
+ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb));
+
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
+}
+
+/**
+ * nbcon_seq_try_update - Try to update the console sequence number
+ * @ctxt: Pointer to an acquire context that contains
+ * all information about the acquire mode
+ * @new_seq: The new sequence number to set
+ *
+ * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to
+ * the 64bit value). This could be a different value than @new_seq if
+ * nbcon_seq_force() was used or the current context no longer owns the
+ * console. In the latter case, it will stop printing anyway.
+ */
+static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
+{
+ unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq);
+ struct console *con = ctxt->console;
+
+ if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq,
+ __u64seq_to_ulseq(new_seq))) {
+ ctxt->seq = new_seq;
+ } else {
+ ctxt->seq = nbcon_seq_read(con);
+ }
+}
+
+/**
+ * nbcon_context_try_acquire_direct - Try to acquire directly
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console when it is released. Also acquire the console when
+ * the current owner has a lower priority and the console is in a safe state.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when the function failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or the current owner or waiter has the same or higher
+ * priority. No acquire method can be successful in
+ * this case.
+ *
+ * -EBUSY: The current owner has a lower priority but the console
+ * is in an unsafe state. The caller should try using
+ * the handover acquire method.
+ */
+static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ do {
+ /*
+ * Panic does not imply that the console is owned. However, it
+ * is critical that non-panic CPUs during panic are unable to
+ * acquire ownership in order to satisfy the assumptions of
+ * nbcon_waiter_matches(). In particular, the assumption that
+ * lower priorities are ignored during panic.
+ */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio)
+ return -EPERM;
+
+ if (cur->unsafe)
+ return -EBUSY;
+
+ /*
+ * The console should never be safe for a direct acquire
+ * if an unsafe hostile takeover has ever happened.
+ */
+ WARN_ON_ONCE(cur->unsafe_takeover);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
+{
+ /*
+ * The request context is well defined by the @req_prio because:
+ *
+ * - Only a context with a priority higher than the owner can become
+ * a waiter.
+ * - Only a context with a priority higher than the waiter can
+ * directly take over the request.
+ * - There are only three priorities.
+ * - Only one CPU is allowed to request PANIC priority.
+ * - Lower priorities are ignored during panic() until reboot.
+ *
+ * As a result, the following scenario is *not* possible:
+ *
+ * 1. This context is currently a waiter.
+ * 2. Another context with a higher priority than this context
+ * directly takes ownership.
+ * 3. The higher priority context releases the ownership.
+ * 4. Another lower priority context takes the ownership.
+ * 5. Another context with the same priority as this context
+ * creates a request and starts waiting.
+ *
+ * Event #1 implies this context is EMERGENCY.
+ * Event #2 implies the new context is PANIC.
+ * Event #3 occurs when panic() has flushed the console.
+ * Events #4 and #5 are not possible due to the other_cpu_in_panic()
+ * check in nbcon_context_try_acquire_direct().
+ */
+
+ return (cur->req_prio == expected_prio);
+}
+
+/**
+ * nbcon_context_try_acquire_requested - Try to acquire after having
+ * requested a handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * This is a helper function for nbcon_context_try_acquire_handover().
+ * It is called when the console is in an unsafe state. The current
+ * owner will release the console on exit from the unsafe region.
+ *
+ * Return: 0 on success and @cur is updated to the new console state.
+ * Otherwise an error code on failure.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU
+ * or this context is no longer the waiter.
+ *
+ * -EBUSY: The console is still locked. The caller should
+ * continue waiting.
+ *
+ * Note: The caller must still remove the request when an error has occurred
+ * except when this context is no longer the waiter.
+ */
+static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ /* Note that the caller must still remove the request! */
+ if (other_cpu_in_panic())
+ return -EPERM;
+
+ /*
+ * Note that the waiter will also change if there was an unsafe
+ * hostile takeover.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* If still locked, caller should continue waiting. */
+ if (cur->prio != NBCON_PRIO_NONE)
+ return -EBUSY;
+
+ /*
+ * The previous owner should have never released ownership
+ * in an unsafe region.
+ */
+ WARN_ON_ONCE(cur->unsafe);
+
+ new.atom = cur->atom;
+ new.prio = ctxt->prio;
+ new.req_prio = NBCON_PRIO_NONE;
+ new.unsafe = cur->unsafe_takeover;
+ new.cpu = cpu;
+
+ if (!nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * The acquire could fail only when it has been taken
+ * over by a higher priority context.
+ */
+ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
+ return -EPERM;
+ }
+
+ /* Handover success. This context now owns the console. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_handover - Try to acquire via handover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * The function must be called only when the context has higher priority
+ * than the current owner and the console is in an unsafe state.
+ * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
+ *
+ * The function sets "req_prio" field to make the current owner aware of
+ * the request. Then it waits until the current owner releases the console,
+ * or an even higher priority context takes over the request, or the timeout expires.
+ *
+ * The current owner checks the "req_prio" field on exit from the unsafe
+ * region and releases the console. It does not touch the "req_prio" field
+ * so that the console stays reserved for the waiter.
+ *
+ * Return: 0 on success. Otherwise, an error code on failure. Also @cur
+ * is updated to the latest state when the function failed to modify it.
+ *
+ * Errors:
+ *
+ * -EPERM: A panic is in progress and this is not the panic CPU.
+ * Or a higher priority context has taken over the
+ * console or the handover request.
+ *
+ * -EBUSY: The current owner is on the same CPU so the
+ * handshake could not work. Or the current owner is not
+ * willing to wait (zero timeout). Or the console does
+ * not enter the safe state before the timeout passed. The
+ * caller might still use the unsafe hostile takeover
+ * when allowed.
+ *
+ * -EAGAIN: @cur has changed when creating the handover request.
+ * The caller should retry with direct acquire.
+ */
+static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+ int timeout;
+ int request_err = -EBUSY;
+
+ /*
+ * Check that the handover is called when the direct acquire failed
+ * with -EBUSY.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(!cur->unsafe);
+
+ /* Handover is not possible on the same CPU. */
+ if (cur->cpu == cpu)
+ return -EBUSY;
+
+ /*
+ * Console stays unsafe after an unsafe takeover until re-initialized.
+ * Waiting is not going to help in this case.
+ */
+ if (cur->unsafe_takeover)
+ return -EBUSY;
+
+ /* Is the caller willing to wait? */
+ if (ctxt->spinwait_max_us == 0)
+ return -EBUSY;
+
+ /*
+ * Setup a request for the handover. The caller should try to acquire
+ * the console directly when the current state has been modified.
+ */
+ new.atom = cur->atom;
+ new.req_prio = ctxt->prio;
+ if (!nbcon_state_try_cmpxchg(con, cur, &new))
+ return -EAGAIN;
+
+ cur->atom = new.atom;
+
+ /* Wait until there is no owner and then acquire the console. */
+ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
+ /* On successful acquire, this request is cleared. */
+ request_err = nbcon_context_try_acquire_requested(ctxt, cur);
+ if (!request_err)
+ return 0;
+
+ /*
+ * If the acquire should be aborted, it must be ensured
+ * that the request is removed before returning to caller.
+ */
+ if (request_err == -EPERM)
+ break;
+
+ udelay(1);
+
+ /* Re-read the state because some time has passed. */
+ nbcon_state_read(con, cur);
+ }
+
+ /* Timed out or aborted. Carefully remove handover request. */
+ do {
+ /*
+ * No need to remove request if there is a new waiter. This
+ * can only happen if a higher priority context has taken over
+ * the console or the handover request.
+ */
+ if (!nbcon_waiter_matches(cur, ctxt->prio))
+ return -EPERM;
+
+ /* Unset request for handover. */
+ new.atom = cur->atom;
+ new.req_prio = NBCON_PRIO_NONE;
+ if (nbcon_state_try_cmpxchg(con, cur, &new)) {
+ /*
+ * Request successfully unset. Report failure of
+ * acquiring via handover.
+ */
+ cur->atom = new.atom;
+ return request_err;
+ }
+
+ /*
+ * Unable to remove request. Try to acquire in case
+ * the owner has released the lock.
+ */
+ } while (nbcon_context_try_acquire_requested(ctxt, cur));
+
+ /* Lucky timing. The acquire succeeded while removing the request. */
+ return 0;
+}
+
+/**
+ * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
+ * @ctxt: The context of the caller
+ * @cur: The current console state
+ *
+ * Acquire the console even in the unsafe state.
+ *
+ * It can be permitted by setting the 'allow_unsafe_takeover' field only
+ * by the final attempt to flush messages in panic().
+ *
+ * Return: 0 on success. -EPERM when not allowed by the context.
+ */
+static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
+ struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state new;
+
+ if (!ctxt->allow_unsafe_takeover)
+ return -EPERM;
+
+ /* Ensure caller is allowed to perform unsafe hostile takeovers. */
+ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
+ return -EPERM;
+
+ /*
+ * Check that try_acquire_direct() and try_acquire_handover() returned
+ * -EBUSY in the right situation.
+ */
+ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
+ WARN_ON_ONCE(cur->unsafe != true);
+
+ do {
+ new.atom = cur->atom;
+ new.cpu = cpu;
+ new.prio = ctxt->prio;
+ new.unsafe |= cur->unsafe_takeover;
+ new.unsafe_takeover |= cur->unsafe;
+
+ } while (!nbcon_state_try_cmpxchg(con, cur, &new));
+
+ return 0;
+}
+
+static struct printk_buffers panic_nbcon_pbufs;
+
+/**
+ * nbcon_context_try_acquire - Try to acquire nbcon console
+ * @ctxt: The context of the caller
+ *
+ * Context: Under @ctxt->con->device_lock() or local_irq_save().
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * If the caller allowed an unsafe hostile takeover, on success the
+ * caller should check the current console state to see if it is
+ * in an unsafe state. Otherwise, on success the caller may assume
+ * the console is not in an unsafe state.
+ */
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ int err;
+
+ nbcon_state_read(con, &cur);
+try_again:
+ err = nbcon_context_try_acquire_direct(ctxt, &cur);
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_handover(ctxt, &cur);
+ if (err == -EAGAIN)
+ goto try_again;
+ if (err != -EBUSY)
+ goto out;
+
+ err = nbcon_context_try_acquire_hostile(ctxt, &cur);
+out:
+ if (err)
+ return false;
+
+ /* Acquire succeeded. */
+
+ /* Assign the appropriate buffer for this context. */
+ if (atomic_read(&panic_cpu) == cpu)
+ ctxt->pbufs = &panic_nbcon_pbufs;
+ else
+ ctxt->pbufs = con->pbufs;
+
+ /* Set the record sequence for this context to print. */
+ ctxt->seq = nbcon_seq_read(ctxt->console);
+
+ return true;
+}
+
+static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
+ int expected_prio)
+{
+ /*
+ * A similar function, nbcon_waiter_matches(), only deals with
+ * EMERGENCY and PANIC priorities. However, this function must also
+ * deal with the NORMAL priority, which requires additional checks
+ * and constraints.
+ *
+ * For the case where preemption and interrupts are disabled, it is
+ * enough to also verify that the owning CPU has not changed.
+ *
+ * For the case where preemption or interrupts are enabled, an
+ * external synchronization method *must* be used. In particular,
+ * the driver-specific locking mechanism used in device_lock()
+ * (including disabling migration) should be used. It prevents
+ * scenarios such as:
+ *
+ * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and
+ * is scheduled out.
+ * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY
+ * and releases it.
+ * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X]
+ * and is scheduled out.
+ * 4. [Task A] gets running on [CPU X] and sees that the console is
+ * still owned by a task on [CPU X] with NBCON_PRIO_NORMAL. Thus
+ * [Task A] thinks it is the owner when it is not.
+ */
+
+ if (cur->prio != expected_prio)
+ return false;
+
+ if (cur->cpu != expected_cpu)
+ return false;
+
+ return true;
+}
+
+/**
+ * nbcon_context_release - Release the console
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ */
+static void nbcon_context_release(struct nbcon_context *ctxt)
+{
+ unsigned int cpu = smp_processor_id();
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ if (!nbcon_owner_matches(&cur, cpu, ctxt->prio))
+ break;
+
+ new.atom = cur.atom;
+ new.prio = NBCON_PRIO_NONE;
+
+ /*
+ * If @unsafe_takeover is set, it is kept set so that
+ * the state remains permanently unsafe.
+ */
+ new.unsafe |= cur.unsafe_takeover;
+
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ ctxt->pbufs = NULL;
+}
+
+/**
+ * nbcon_context_can_proceed - Check whether ownership can proceed
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @cur: The current console state
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * Must be invoked when entering the unsafe state to make sure that it still
+ * owns the lock. Also must be invoked when exiting the unsafe context
+ * to eventually free the lock for a higher priority context which asked
+ * for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporarily in a safe state instead of exiting and entering the unsafe
+ * state.
+ *
+ * Also it can be called in the safe context before doing an expensive
+ * safe operation. It does not make sense to do the operation when
+ * a higher priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
+{
+ unsigned int cpu = smp_processor_id();
+
+ /* Make sure this context still owns the console. */
+ if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
+ return false;
+
+ /* The console owner can proceed if there is no waiter. */
+ if (cur->req_prio == NBCON_PRIO_NONE)
+ return true;
+
+ /*
+ * A console owner within an unsafe region is always allowed to
+ * proceed, even if there are waiters. It can perform a handover
+ * when exiting the unsafe region. Otherwise the waiter will
+ * need to perform an unsafe hostile takeover.
+ */
+ if (cur->unsafe)
+ return true;
+
+ /* Waiters always have higher priorities than owners. */
+ WARN_ON_ONCE(cur->req_prio <= cur->prio);
+
+ /*
+ * Having a safe point for takeover and eventually a few
+ * duplicated characters or a full line is way better than a
+ * hostile takeover. Post processing can take care of the garbage.
+ * Release and hand over.
+ */
+ nbcon_context_release(ctxt);
+
+ /*
+ * It is not clear whether the waiter really took over ownership. The
+ * outermost callsite must make the final decision whether console
+ * ownership is needed for it to proceed. If yes, it must reacquire
+ * ownership (possibly hostile) before carefully proceeding.
+ *
+ * The calling context no longer owns the console so go back all the
+ * way instead of trying to implement reacquire heuristics in tons of
+ * places.
+ */
+ return false;
+}
+
+/**
+ * nbcon_can_proceed - Check whether ownership can proceed
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * It is used in nbcon_enter_unsafe() to make sure that it still owns the
+ * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
+ * for a higher priority context which asked for the friendly handover.
+ *
+ * It can be called inside an unsafe section when the console is just
+ * temporarily in a safe state instead of exiting and entering the unsafe state.
+ *
+ * Also it can be called in the safe context before doing an expensive safe
+ * operation. It does not make sense to do the operation when a higher
+ * priority context took the lock.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ nbcon_state_read(con, &cur);
+
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+EXPORT_SYMBOL_GPL(nbcon_can_proceed);
+
+#define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true)
+#define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false)
+
+/**
+ * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ * @unsafe: The new value for the unsafe bit
+ *
+ * Return: True if the unsafe state was updated and this context still
+ * owns the console. Otherwise false if ownership was handed
+ * over or taken.
+ *
+ * This function allows console owners to modify the unsafe status of the
+ * console.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ *
+ * Internal helper to avoid duplicated code.
+ */
+static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
+{
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+ struct nbcon_state new;
+
+ nbcon_state_read(con, &cur);
+
+ do {
+ /*
+ * The unsafe bit must not be cleared if an
+ * unsafe hostile takeover has occurred.
+ */
+ if (!unsafe && cur.unsafe_takeover)
+ goto out;
+
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ new.atom = cur.atom;
+ new.unsafe = unsafe;
+ } while (!nbcon_state_try_cmpxchg(con, &cur, &new));
+
+ cur.atom = new.atom;
+out:
+ return nbcon_context_can_proceed(ctxt, &cur);
+}
+
+static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+ char *buf, unsigned int len)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ struct nbcon_state cur;
+
+ wctxt->outbuf = buf;
+ wctxt->len = len;
+ nbcon_state_read(con, &cur);
+ wctxt->unsafe_takeover = cur.unsafe_takeover;
+}
+
+/**
+ * nbcon_enter_unsafe - Enter an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool is_owner;
+
+ is_owner = nbcon_context_enter_unsafe(ctxt);
+ if (!is_owner)
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+ return is_owner;
+}
+EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);
+
+/**
+ * nbcon_exit_unsafe - Exit an unsafe region in the driver
+ * @wctxt: The write context that was handed to the write function
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context.
+ */
+bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ bool ret;
+
+ ret = nbcon_context_exit_unsafe(ctxt);
+ if (!ret)
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
+
+/**
+ * nbcon_reacquire_nobuf - Reacquire a console after losing ownership
+ * while printing
+ * @wctxt: The write context that was handed to the write callback
+ *
+ * Since ownership can be lost at any time due to handover or takeover, a
+ * printing context _must_ be prepared to back out immediately and
+ * carefully. However, there are scenarios where the printing context must
+ * reacquire ownership in order to finalize or revert hardware changes.
+ *
+ * This function allows a printing context to reacquire ownership using the
+ * same priority as its previous ownership.
+ *
+ * Note that after a successful reacquire the printing context will have no
+ * output buffer because that has been lost. This function cannot be used to
+ * resume printing.
+ */
+void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+ while (!nbcon_context_try_acquire(ctxt))
+ cpu_relax();
+
+ nbcon_write_context_set_buf(wctxt, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);
+
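Taken together, nbcon_enter_unsafe(), nbcon_exit_unsafe() and nbcon_reacquire_nobuf() form the driver-facing side of the ownership scheme. A schematic write_atomic() callback could use them as sketched below; my_hw_prepare(), my_hw_putchar() and my_hw_restore() are hypothetical placeholders, not a real driver API.

/* Sketch of a driver write_atomic() callback using the unsafe-section API. */
static void example_write_atomic(struct console *con,
				 struct nbcon_write_context *wctxt)
{
	unsigned int i;

	/* The device is inconsistent while printing: mark the section unsafe. */
	if (!nbcon_enter_unsafe(wctxt))
		return;				/* ownership lost, back out */

	my_hw_prepare(con);			/* hypothetical device setup */

	for (i = 0; i < wctxt->len; i++)
		my_hw_putchar(con, wctxt->outbuf[i]);

	my_hw_restore(con);			/* hypothetical device restore */

	/* May fail if a handover was requested; ownership is then gone. */
	nbcon_exit_unsafe(wctxt);
}

If a driver only notices the loss of ownership after it has already reprogrammed the device, nbcon_reacquire_nobuf() lets it win ownership back just long enough to restore the device state; the output buffer is gone at that point, so printing cannot be resumed.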
+/**
+ * nbcon_emit_next_record - Emit a record in the acquired context
+ * @wctxt: The write context that will be handed to the write function
+ * @use_atomic: True if the write_atomic() callback is to be used
+ *
+ * Return: True if this context still owns the console. False if
+ * ownership was handed over or taken.
+ *
+ * When this function returns false then the calling context no longer owns
+ * the console and is no longer allowed to go forward. In this case it must
+ * back out immediately and carefully. The buffer content is also no longer
+ * trusted since it no longer belongs to the calling context. If the caller
+ * wants to do more it must reacquire the console first.
+ *
+ * When true is returned, @wctxt->ctxt.backlog indicates whether there are
+ * still records pending in the ringbuffer.
+ */
+static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ struct printk_message pmsg = {
+ .pbufs = ctxt->pbufs,
+ };
+ unsigned long con_dropped;
+ struct nbcon_state cur;
+ unsigned long dropped;
+ unsigned long ulseq;
+
+ /*
+ * This function should never be called for consoles that have not
+ * implemented the necessary callback for writing: i.e. legacy
+ * consoles and, when atomic, nbcon consoles with no write_atomic().
+ * Handle it as if ownership was lost and try to continue.
+ *
+ * Note that for nbcon consoles the write_thread() callback is
+ * mandatory and was already checked in nbcon_alloc().
+ */
+ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) ||
+ !(console_srcu_read_flags(con) & CON_NBCON))) {
+ nbcon_context_release(ctxt);
+ return false;
+ }
+
+ /*
+ * The printk buffers are filled within an unsafe section. This
+ * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
+ * clobbering each other.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
+ if (!ctxt->backlog)
+ return nbcon_context_exit_unsafe(ctxt);
+
+ /*
+ * @con->dropped is not protected in case of an unsafe hostile
+ * takeover. In that situation the update can be racy so
+ * annotate it accordingly.
+ */
+ con_dropped = data_race(READ_ONCE(con->dropped));
+
+ dropped = con_dropped + pmsg.dropped;
+ if (dropped && !is_extended)
+ console_prepend_dropped(&pmsg, dropped);
+
+ /*
+ * If the previous owner was assigned the same record, this context
+ * has taken over ownership and is replaying the record. Prepend a
+ * message to let the user know the record is replayed.
+ */
+ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq));
+ if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) {
+ console_prepend_replay(&pmsg);
+ } else {
+ /*
+ * Ensure this context is still the owner before trying to
+ * update @nbcon_prev_seq. Otherwise the value in @ulseq may
+ * not be from the previous owner and instead be some later
+ * value from the context that took over ownership.
+ */
+ nbcon_state_read(con, &cur);
+ if (!nbcon_context_can_proceed(ctxt, &cur))
+ return false;
+
+ atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq,
+ __u64seq_to_ulseq(pmsg.seq));
+ }
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return false;
+
+ /* For skipped records just update seq/dropped in @con. */
+ if (pmsg.outbuf_len == 0)
+ goto update_con;
+
+ /* Initialize the write context for driver callbacks. */
+ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);
+
+ if (use_atomic)
+ con->write_atomic(con, wctxt);
+ else
+ con->write_thread(con, wctxt);
+
+ if (!wctxt->outbuf) {
+ /*
+ * Ownership was lost and reacquired by the driver. Handle it
+ * as if ownership was lost.
+ */
+ nbcon_context_release(ctxt);
+ return false;
+ }
+
+ /*
+ * Ownership may have been lost but _not_ reacquired by the driver.
+ * This case is detected and handled when entering unsafe to update
+ * dropped/seq values.
+ */
+
+ /*
+ * Since any dropped message was successfully output, reset the
+ * dropped count for the console.
+ */
+ dropped = 0;
+update_con:
+ /*
+ * The dropped count and the sequence number are updated within an
+ * unsafe section. This limits update races to the panic context and
+ * allows the panic context to win.
+ */
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ if (dropped != con_dropped) {
+ /* Counterpart to the READ_ONCE() above. */
+ WRITE_ONCE(con->dropped, dropped);
+ }
+
+ nbcon_seq_try_update(ctxt, pmsg.seq + 1);
+
+ return nbcon_context_exit_unsafe(ctxt);
+}
+
+/*
+ * nbcon_emit_one - Print one record for an nbcon console using the
+ * specified callback
+ * @wctxt: An initialized write context struct to use for this context
+ * @use_atomic: True if the write_atomic() callback is to be used
+ *
+ * Return: True, when a record has been printed and there are still
+ * pending records. The caller might want to continue flushing.
+ *
+ * False, when there is no pending record, or when the console
+ * context cannot be acquired, or the ownership has been lost.
+ * The caller should give up. Either the job is done, cannot be
+ * done, or will be handled by the owning context.
+ *
+ * This is an internal helper to handle the locking of the console before
+ * calling nbcon_emit_next_record().
+ */
+static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+ struct console *con = ctxt->console;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!use_atomic) {
+ con->device_lock(con, &flags);
+
+ /*
+ * Ensure this stays on the CPU to make handover and
+ * takeover possible.
+ */
+ cant_migrate();
+ }
+
+ if (!nbcon_context_try_acquire(ctxt))
+ goto out;
+
+ /*
+ * nbcon_emit_next_record() returns false when the console was
+ * handed over or taken over. In both cases the context is no
+ * longer valid.
+ *
+ * The higher priority printing context takes over responsibility
+ * to print the pending records.
+ */
+ if (!nbcon_emit_next_record(wctxt, use_atomic))
+ goto out;
+
+ nbcon_context_release(ctxt);
+
+ ret = ctxt->backlog;
+out:
+ if (!use_atomic)
+ con->device_unlock(con, flags);
+ return ret;
+}
+
+/**
+ * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup
+ * @con: Console to operate on
+ * @ctxt: The nbcon context from nbcon_context_try_acquire()
+ *
+ * Return: True if the thread should shutdown or if the console is
+ * allowed to print and a record is available. False otherwise.
+ *
+ * After the thread wakes up, it must first check if it should shutdown before
+ * attempting any printing.
+ */
+static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt)
+{
+ bool ret = false;
+ short flags;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ cookie = console_srcu_read_lock();
+
+ flags = console_srcu_read_flags(con);
+ if (console_is_usable(con, flags, false)) {
+ /* Bring the sequence in @ctxt up to date */
+ ctxt->seq = nbcon_seq_read(con);
+
+ ret = prb_read_valid(prb, ctxt->seq, NULL);
+ }
+
+ console_srcu_read_unlock(cookie);
+ return ret;
+}
+
+/**
+ * nbcon_kthread_func - The printer thread function
+ * @__console: Console to operate on
+ *
+ * Return: 0
+ */
+static int nbcon_kthread_func(void *__console)
+{
+ struct console *con = __console;
+ struct nbcon_write_context wctxt = {
+ .ctxt.console = con,
+ .ctxt.prio = NBCON_PRIO_NORMAL,
+ };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ short con_flags;
+ bool backlog;
+ int cookie;
+
+wait_for_event:
+ /*
+ * Guarantee this task is visible on the rcuwait before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * ___rcuwait_wait_event() pairs with the full memory
+ * barrier within rcuwait_has_sleeper().
+ *
+ * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A.
+ */
+ rcuwait_wait_event(&con->rcuwait,
+ nbcon_kthread_should_wakeup(con, ctxt),
+ TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */
+
+ do {
+ if (kthread_should_stop())
+ return 0;
+
+ backlog = false;
+
+ /*
+ * Keep the srcu read lock around the entire operation so that
+ * synchronize_srcu() can guarantee that the kthread stopped
+ * or suspended printing.
+ */
+ cookie = console_srcu_read_lock();
+
+ con_flags = console_srcu_read_flags(con);
+
+ if (console_is_usable(con, con_flags, false))
+ backlog = nbcon_emit_one(&wctxt, false);
+
+ console_srcu_read_unlock(cookie);
+
+ cond_resched();
+
+ } while (backlog);
+
+ goto wait_for_event;
+}
+
+/**
+ * nbcon_irq_work - irq work to wake console printer thread
+ * @irq_work: The irq work to operate on
+ */
+static void nbcon_irq_work(struct irq_work *irq_work)
+{
+ struct console *con = container_of(irq_work, struct console, irq_work);
+
+ nbcon_kthread_wake(con);
+}
+
+static inline bool rcuwait_has_sleeper(struct rcuwait *w)
+{
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the rcuwait is empty.
+ *
+ * This full memory barrier pairs with the full memory barrier within
+ * set_current_state() of ___rcuwait_wait_event(), which is called
+ * after prepare_to_rcuwait() adds the waiter but before it has
+ * checked the wait condition.
+ *
+ * This pairs with nbcon_kthread_func:A.
+ */
+ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */
+ return rcuwait_active(w);
+}
+
+/**
+ * nbcon_kthreads_wake - Wake up printing threads using irq_work
+ */
+void nbcon_kthreads_wake(void)
+{
+ struct console *con;
+ int cookie;
+
+ if (!printk_kthreads_running)
+ return;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ if (!(console_srcu_read_flags(con) & CON_NBCON))
+ continue;
+
+ /*
+ * Only schedule irq_work if the printing thread is
+ * actively waiting. If not waiting, the thread will
+ * notice by itself that it has work to do.
+ */
+ if (rcuwait_has_sleeper(&con->rcuwait))
+ irq_work_queue(&con->irq_work);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/*
+ * nbcon_kthread_stop - Stop a console printer thread
+ * @con: Console to operate on
+ */
+void nbcon_kthread_stop(struct console *con)
+{
+ lockdep_assert_console_list_lock_held();
+
+ if (!con->kthread)
+ return;
+
+ kthread_stop(con->kthread);
+ con->kthread = NULL;
+}
+
+/**
+ * nbcon_kthread_create - Create a console printer thread
+ * @con: Console to operate on
+ *
+ * Return: True if the kthread was started or already exists.
+ * Otherwise false and @con must not be registered.
+ *
+ * This function is called when it is expected that nbcon consoles will be
+ * flushed using the kthread. Messages printed with NBCON_PRIO_NORMAL will
+ * no longer be flushed by the legacy loop. This is why failure must be
+ * fatal for console registration.
+ *
+ * If @con was already registered and this function fails, @con must be
+ * unregistered before the global state variable @printk_kthreads_running
+ * can be set.
+ */
+bool nbcon_kthread_create(struct console *con)
+{
+ struct task_struct *kt;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (con->kthread)
+ return true;
+
+ kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index);
+ if (WARN_ON(IS_ERR(kt))) {
+ con_printk(KERN_ERR, con, "failed to start printing thread\n");
+ return false;
+ }
+
+ con->kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(con->kthread, -20);
+
+ return true;
+}
+
+/* Track the nbcon emergency nesting per CPU. */
+static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting);
+static unsigned int early_nbcon_pcpu_emergency_nesting __initdata;
+
+/**
+ * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer
+ *
+ * Context: For reading, any context. For writing, any context that cannot
+ * be migrated to another CPU.
+ * Return: Either a pointer to the per CPU emergency nesting counter of
+ * the current CPU or to the init data during early boot.
+ *
+ * The function is safe for reading per-CPU variables in any context because
+ * preemption is disabled if the current CPU is in the emergency state. See
+ * also nbcon_cpu_emergency_enter().
+ */
+static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void)
+{
+ /*
+ * The value of __printk_percpu_data_ready gets set in normal
+ * context and before SMP initialization. As a result it could
+ * never change while inside an nbcon emergency section.
+ */
+ if (!printk_percpu_data_ready())
+ return &early_nbcon_pcpu_emergency_nesting;
+
+ return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting);
+}
+
+/**
+ * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon
+ * printing on the current CPU
+ *
+ * Context: Any context.
+ * Return: The nbcon_prio to use for acquiring an nbcon console in this
+ * context for printing.
+ *
+ * The function is safe for reading per-CPU data in any context because
+ * preemption is disabled if the current CPU is in the emergency or panic
+ * state.
+ */
+enum nbcon_prio nbcon_get_default_prio(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ if (this_cpu_in_panic())
+ return NBCON_PRIO_PANIC;
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ if (*cpu_emergency_nesting)
+ return NBCON_PRIO_EMERGENCY;
+
+ return NBCON_PRIO_NORMAL;
+}
+
+/**
+ * nbcon_legacy_emit_next_record - Print one record for an nbcon console
+ * in legacy contexts
+ * @con: The console to print on
+ * @handover: Will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding
+ * both the console_lock and the SRCU read lock. Otherwise it
+ * is set to false.
+ * @cookie: The cookie from the SRCU read lock.
+ * @use_atomic: Set true when called in an atomic or unknown context.
+ * It affects which nbcon callback will be used: write_atomic()
+ * or write_thread().
+ *
+ * When false, the write_thread() callback is used and would be
+ * called in a preemptible context unless disabled by the
+ * device_lock. The legacy handover is not allowed in this mode.
+ *
+ * Context: Any context except NMI.
+ * Return: True, when a record has been printed and there are still
+ * pending records. The caller might want to continue flushing.
+ *
+ * False, when there is no pending record, or when the console
+ * context cannot be acquired, or the ownership has been lost.
+ * The caller should give up. Either the job is done, cannot be
+ * done, or will be handled by the owning context.
+ *
+ * This function is meant to be called by console_flush_all() to print records
+ * on nbcon consoles from legacy context (printing via console unlocking).
+ * Essentially it is the nbcon version of console_emit_next_record().
+ */
+bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
+ int cookie, bool use_atomic)
+{
+ struct nbcon_write_context wctxt = { };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ unsigned long flags;
+ bool progress;
+
+ ctxt->console = con;
+ ctxt->prio = nbcon_get_default_prio();
+
+ if (use_atomic) {
+ /*
+ * In an atomic or unknown context, use the same procedure as
+ * in console_emit_next_record(). It allows the legacy spinning handover.
+ */
+ printk_safe_enter_irqsave(flags);
+ console_lock_spinning_enable();
+ stop_critical_timings();
+ }
+
+ progress = nbcon_emit_one(&wctxt, use_atomic);
+
+ if (use_atomic) {
+ start_critical_timings();
+ *handover = console_lock_spinning_disable_and_check(cookie);
+ printk_safe_exit_irqrestore(flags);
+ } else {
+ /* Non-atomic does not perform legacy spinning handovers. */
+ *handover = false;
+ }
+
+ return progress;
+}
+
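/*
 * Illustrative sketch only, not part of this patch: roughly how a legacy
 * flushing loop (such as console_flush_all()) could drive
 * nbcon_legacy_emit_next_record(). The wrapper name and its parameters are
 * made up for the example; it assumes the caller already holds the
 * console_lock and the SRCU read lock identified by @cookie.
 */
static void example_legacy_flush_nbcon(struct console *con, int cookie)
{
	bool handover;
	bool progress;

	do {
		/* Print one record using the write_atomic() path. */
		progress = nbcon_legacy_emit_next_record(con, &handover,
							 cookie, true);
		if (handover) {
			/*
			 * A spinning waiter took over the console_lock; this
			 * context no longer holds the console_lock nor the
			 * SRCU read lock and must back out immediately.
			 */
			return;
		}
	} while (progress);	/* keep going while records are pending */
}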
+/**
+ * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
+ * write_atomic() callback
+ * @con: The nbcon console to flush
+ * @stop_seq: Flush up until this record
+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
+ *
+ * Return: 0 if @con was flushed up to @stop_seq. Otherwise, an error code
+ * on failure.
+ *
+ * Errors:
+ *
+ * -EPERM: Unable to acquire console ownership.
+ *
+ * -EAGAIN: Another context took over ownership while printing.
+ *
+ * -ENOENT: A record before @stop_seq is not available.
+ *
+ * If flushing up to @stop_seq was not successful, it only makes sense for the
+ * caller to try again when -EAGAIN was returned. When -EPERM is returned,
+ * this context is not allowed to acquire the console. When -ENOENT is
+ * returned, it cannot be expected that the unfinalized record will become
+ * available.
+ */
+static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
+ bool allow_unsafe_takeover)
+{
+ struct nbcon_write_context wctxt = { };
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
+ int err = 0;
+
+ ctxt->console = con;
+ ctxt->spinwait_max_us = 2000;
+ ctxt->prio = nbcon_get_default_prio();
+ ctxt->allow_unsafe_takeover = allow_unsafe_takeover;
+
+ if (!nbcon_context_try_acquire(ctxt))
+ return -EPERM;
+
+ while (nbcon_seq_read(con) < stop_seq) {
+ /*
+ * nbcon_emit_next_record() returns false when the console was
+ * handed over or taken over. In both cases the context is no
+ * longer valid.
+ */
+ if (!nbcon_emit_next_record(&wctxt, true))
+ return -EAGAIN;
+
+ if (!ctxt->backlog) {
+ /* Are there reserved but not yet finalized records? */
+ if (nbcon_seq_read(con) < stop_seq)
+ err = -ENOENT;
+ break;
+ }
+ }
+
+ nbcon_context_release(ctxt);
+ return err;
+}
+
+/**
+ * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
+ * write_atomic() callback
+ * @con: The nbcon console to flush
+ * @stop_seq: Flush up until this record
+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
+ *
+ * This will stop flushing before @stop_seq if another context has ownership.
+ * That context is then responsible for the flushing. Likewise, if new records
+ * are added while this context was flushing and there is no other context
+ * to handle the printing, this context must also flush those records.
+ */
+static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
+ bool allow_unsafe_takeover)
+{
+ struct console_flush_type ft;
+ unsigned long flags;
+ int err;
+
+again:
+ /*
+ * Atomic flushing does not use console driver synchronization (i.e.
+ * it does not hold the port lock for uart consoles). Therefore IRQs
+ * must be disabled to avoid being interrupted and then calling into
+ * a driver that will deadlock trying to acquire console ownership.
+ */
+ local_irq_save(flags);
+
+ err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+
+ local_irq_restore(flags);
+
+ /*
+ * If there was a new owner (-EPERM, -EAGAIN), that context is
+ * responsible for completing the flush.
+ *
+ * Do not wait for records not yet finalized (-ENOENT) to avoid a
+ * possible deadlock. They will either get flushed by the writer or
+ * eventually skipped on panic CPU.
+ */
+ if (err)
+ return;
+
+ /*
+ * If flushing was successful but more records are available, this
+ * context must flush those remaining records if the printer thread
+ * is not available to do it.
+ */
+ printk_get_console_flush_type(&ft);
+ if (!ft.nbcon_offload &&
+ prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
+ stop_seq = prb_next_reserve_seq(prb);
+ goto again;
+ }
+}
+
+/**
+ * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
+ * write_atomic() callback
+ * @stop_seq: Flush up until this record
+ * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers
+ */
+static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover)
+{
+ struct console *con;
+ int cookie;
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+
+ if (!(flags & CON_NBCON))
+ continue;
+
+ if (!console_is_usable(con, flags, true))
+ continue;
+
+ if (nbcon_seq_read(con) >= stop_seq)
+ continue;
+
+ nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
+ }
+ console_srcu_read_unlock(cookie);
+}
+
+/**
+ * nbcon_atomic_flush_pending - Flush all nbcon consoles using their
+ * write_atomic() callback
+ *
+ * Flush the backlog up through the currently newest record. Any new
+ * records added while flushing will not be flushed if there is another
+ * context available to handle the flushing. This is to avoid one CPU
+ * printing without bound because other CPUs continue to add records.
+ */
+void nbcon_atomic_flush_pending(void)
+{
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false);
+}
+
+/**
+ * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their
+ * write_atomic() callback and allowing unsafe hostile takeovers
+ *
+ * Flush the backlog up through the currently newest record. Unsafe hostile
+ * takeovers will be performed, if necessary.
+ */
+void nbcon_atomic_flush_unsafe(void)
+{
+ __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true);
+}
+
+/**
+ * nbcon_cpu_emergency_enter - Enter an emergency section where printk()
+ * messages for that CPU are flushed directly
+ *
+ * Context: Any context. Disables preemption.
+ *
+ * When within an emergency section, printk() calls will attempt to flush any
+ * pending messages in the ringbuffer.
+ */
+void nbcon_cpu_emergency_enter(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ preempt_disable();
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+ (*cpu_emergency_nesting)++;
+}
+
+/**
+ * nbcon_cpu_emergency_exit - Exit an emergency section
+ *
+ * Context: Within an emergency section. Enables preemption.
+ */
+void nbcon_cpu_emergency_exit(void)
+{
+ unsigned int *cpu_emergency_nesting;
+
+ cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+
+ if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
+ (*cpu_emergency_nesting)--;
+
+ preempt_enable();
+}
+
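/*
 * Illustrative sketch only, not part of this patch: the intended pairing of
 * the two helpers above around a block of diagnostic output. The function
 * name and the message are made up; the enter/exit nesting and the
 * preemption-disabled region are what the helpers provide.
 */
static void example_dump_state(void)
{
	nbcon_cpu_emergency_enter();	/* disables preemption, bumps nesting */

	pr_emerg("example: dumping state for CPU %d\n", raw_smp_processor_id());
	/* ... further printk() calls are flushed directly on nbcon consoles ... */

	nbcon_cpu_emergency_exit();	/* drops nesting, re-enables preemption */
}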
+/**
+ * nbcon_alloc - Allocate and init the nbcon console specific data
+ * @con: Console to initialize
+ *
+ * Return: True if the console was fully allocated and initialized.
+ * Otherwise @con must not be registered.
+ *
+ * When allocation and init were successful, the console must be properly
+ * freed using nbcon_free() once it is no longer needed.
+ */
+bool nbcon_alloc(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ /* The write_thread() callback is mandatory. */
+ if (WARN_ON(!con->write_thread))
+ return false;
+
+ rcuwait_init(&con->rcuwait);
+ init_irq_work(&con->irq_work, nbcon_irq_work);
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL);
+ nbcon_state_set(con, &state);
+
+ /*
+ * Initialize @nbcon_seq to the highest possible sequence number so
+ * that practically speaking it will have nothing to print until a
+ * desired initial sequence number has been set via nbcon_seq_force().
+ */
+ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb));
+
+ if (con->flags & CON_BOOT) {
+ /*
+ * Boot console printing is synchronized with legacy console
+ * printing, so boot consoles can share the same global printk
+ * buffers.
+ */
+ con->pbufs = &printk_shared_pbufs;
+ } else {
+ con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
+ if (!con->pbufs) {
+ con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
+ return false;
+ }
+
+ if (printk_kthreads_running) {
+ if (!nbcon_kthread_create(con)) {
+ kfree(con->pbufs);
+ con->pbufs = NULL;
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/**
+ * nbcon_free - Free and cleanup the nbcon console specific data
+ * @con: Console to free/cleanup nbcon data
+ */
+void nbcon_free(struct console *con)
+{
+ struct nbcon_state state = { };
+
+ if (printk_kthreads_running)
+ nbcon_kthread_stop(con);
+
+ nbcon_state_set(con, &state);
+
+ /* Boot consoles share global printk buffers. */
+ if (!(con->flags & CON_BOOT))
+ kfree(con->pbufs);
+
+ con->pbufs = NULL;
+}
+
+/**
+ * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe
+ * section
+ * @con: The nbcon console to acquire
+ *
+ * Context: Under the locking mechanism implemented in
+ * @con->device_lock() including disabling migration.
+ * Return: True if the console was acquired. False otherwise.
+ *
+ * Console drivers will usually use their own internal synchronization
+ * mechanism to synchronize between console printing and non-printing
+ * activities (such as setting baud rates). However, nbcon console drivers
+ * supporting atomic consoles may also want to mark unsafe sections when
+ * performing non-printing activities in order to synchronize against their
+ * write_atomic() callback.
+ *
+ * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
+ * and marks it unsafe for handover/takeover.
+ */
+bool nbcon_device_try_acquire(struct console *con)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
+
+ cant_migrate();
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->console = con;
+ ctxt->prio = NBCON_PRIO_NORMAL;
+
+ if (!nbcon_context_try_acquire(ctxt))
+ return false;
+
+ if (!nbcon_context_enter_unsafe(ctxt))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);
+
+/**
+ * nbcon_device_release - Exit unsafe section and release the nbcon console
+ * @con: The nbcon console acquired in nbcon_device_try_acquire()
+ */
+void nbcon_device_release(struct console *con)
+{
+ struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
+ struct console_flush_type ft;
+ int cookie;
+
+ if (!nbcon_context_exit_unsafe(ctxt))
+ return;
+
+ nbcon_context_release(ctxt);
+
+ /*
+ * This context must flush any new records added while the console
+ * was locked if the printer thread is not available to do it. The
+ * console_srcu_read_lock must be taken to ensure the console is
+ * usable throughout flushing.
+ */
+ cookie = console_srcu_read_lock();
+ printk_get_console_flush_type(&ft);
+ if (console_is_usable(con, console_srcu_read_flags(con), true) &&
+ !ft.nbcon_offload &&
+ prb_read_valid(prb, nbcon_seq_read(con), NULL)) {
+ /*
+ * If nbcon_atomic flushing is not available, fall back to
+ * using the legacy loop.
+ */
+ if (ft.nbcon_atomic) {
+ __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false);
+ } else if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ } else if (ft.legacy_offload) {
+ printk_trigger_flush();
+ }
+ }
+ console_srcu_read_unlock(cookie);
+}
+EXPORT_SYMBOL_GPL(nbcon_device_release);
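/*
 * Illustrative sketch only, not part of this patch: how a console driver
 * might bracket a non-printing hardware access (for example a baud rate
 * change) with nbcon_device_try_acquire()/nbcon_device_release(). The
 * function name and the register access are placeholders; the required
 * device_lock()/device_unlock() bracket and the acquire/release pairing
 * follow the kernel-doc of the helpers above.
 */
static void example_reconfigure_hw(struct console *con)
{
	unsigned long flags;

	con->device_lock(con, &flags);		/* also disables migration */

	if (nbcon_device_try_acquire(con)) {
		/* Owned and marked unsafe: write_atomic() cannot interleave. */
		/* ... reprogram the console hardware here ... */
		nbcon_device_release(con);	/* may flush pending records */
	}

	con->device_unlock(con, flags);
}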
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b71eaf5f5a86..07668433644b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -34,7 +34,8 @@
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
-#include <linux/crash_core.h>
+#include <linux/syscore_ops.h>
+#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
@@ -55,6 +56,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
+#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"
@@ -70,6 +72,8 @@ EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
+EXPORT_TRACEPOINT_SYMBOL_GPL(console);
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -78,13 +82,20 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
+ */
+static DEFINE_MUTEX(console_mutex);
+
+/*
+ * console_sem protects updates to console->seq
+ * and also provides serialization for console printing.
*/
-static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
+static DEFINE_SEMAPHORE(console_sem, 1);
+HLIST_HEAD(console_list);
+EXPORT_SYMBOL_GPL(console_list);
+DEFINE_STATIC_SRCU(console_srcu);
/*
* System may need to suppress printk message under certain
@@ -96,6 +107,20 @@ int __read_mostly suppress_printk;
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
+
+void lockdep_assert_console_list_lock_held(void)
+{
+ lockdep_assert_held(&console_mutex);
+}
+EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+bool console_srcu_read_lock_is_held(void)
+{
+ return srcu_read_lock_held(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif
enum devkmsg_log_bits {
@@ -145,16 +170,18 @@ static int __control_devkmsg(char *str)
static int __init control_devkmsg(char *str)
{
- if (__control_devkmsg(str) < 0)
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
+ }
/*
* Set sysctl string accordingly:
*/
if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
- strcpy(devkmsg_log_str, "on");
+ strscpy(devkmsg_log_str, "on");
else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
- strcpy(devkmsg_log_str, "off");
+ strscpy(devkmsg_log_str, "off");
/* else "ratelimit" which is set by default. */
/*
@@ -165,13 +192,13 @@ static int __init control_devkmsg(char *str)
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
- return 0;
+ return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
-
-int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
+#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
+int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
char old_str[DEVKMSG_STR_MAX_SIZE];
@@ -183,7 +210,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
return -EINVAL;
old = devkmsg_log;
- strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+ strscpy(old_str, devkmsg_log_str);
}
err = proc_dostring(table, write, buffer, lenp, ppos);
@@ -201,7 +228,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
/* ... and restore old setting. */
devkmsg_log = old;
- strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+ strscpy(devkmsg_log_str, old_str);
return -EINVAL;
}
@@ -209,9 +236,72 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
return 0;
}
+#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
-/* Number of registered extended console drivers. */
-static int nr_ext_console_drivers;
+/**
+ * console_list_lock - Lock the console list
+ *
+ * For console list or console->flags updates
+ */
+void console_list_lock(void)
+{
+ /*
+ * In unregister_console() and console_force_preferred_locked(),
+ * synchronize_srcu() is called with the console_list_lock held.
+ * Therefore it is not allowed that the console_list_lock is taken
+ * with the srcu_lock held.
+ *
+ * Detecting if this context is really in the read-side critical
+ * section is only possible if the appropriate debug options are
+ * enabled.
+ */
+ WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
+ srcu_read_lock_held(&console_srcu));
+
+ mutex_lock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_lock);
+
+/**
+ * console_list_unlock - Unlock the console list
+ *
+ * Counterpart to console_list_lock()
+ */
+void console_list_unlock(void)
+{
+ mutex_unlock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_unlock);
+
+/**
+ * console_srcu_read_lock - Register a new reader for the
+ * SRCU-protected console list
+ *
+ * Use for_each_console_srcu() to iterate the console list
+ *
+ * Context: Any context.
+ * Return: A cookie to pass to console_srcu_read_unlock().
+ */
+int console_srcu_read_lock(void)
+ __acquires(&console_srcu)
+{
+ return srcu_read_lock_nmisafe(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock);
+
+/**
+ * console_srcu_read_unlock - Unregister an old reader from
+ * the SRCU-protected console list
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * Counterpart to console_srcu_read_lock()
+ */
+void console_srcu_read_unlock(int cookie)
+ __releases(&console_srcu)
+{
+ srcu_read_unlock_nmisafe(&console_srcu, cookie);
+}
+EXPORT_SYMBOL(console_srcu_read_unlock);
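/*
 * Illustrative sketch only, not part of this patch: the read-side pattern
 * the two helpers above are meant for. The loop body is a placeholder; the
 * lock/iterate/unlock structure mirrors the callers added elsewhere in this
 * patch.
 */
static void example_walk_console_list(void)
{
	struct console *con;
	int cookie;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		short flags = console_srcu_read_flags(con);

		if (!(flags & CON_ENABLED))
			continue;

		/* ... inspect the enabled console @con under SRCU protection ... */
	}
	console_srcu_read_unlock(cookie);
}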
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
@@ -255,20 +345,43 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
+static bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+
+/* Return true if a panic is in progress on the current CPU. */
+bool this_cpu_in_panic(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool other_cpu_in_panic(void)
+{
+ return (panic_in_progress() && !this_cpu_in_panic());
+}
+
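/*
 * Illustrative sketch only, not part of this patch: the kind of early-out a
 * printing path can use so that a CPU which is not panicking backs off and
 * leaves the console resources to the panic CPU. The helper name is made up;
 * the check itself is other_cpu_in_panic() defined above.
 */
static bool example_may_keep_printing(void)
{
	/* Another CPU is panicking: release/avoid printing resources. */
	if (other_cpu_in_panic())
		return false;

	return true;
}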
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
* hold it and are racing, but it helps tracking those weird code
* paths in the console code where we end up in places I want
- * locked without the console sempahore held).
+ * locked without the console semaphore held).
*/
-static int console_locked, console_suspended;
-
-/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
+static int console_locked;
/*
* Array of consoles built from command line options (console=)
@@ -279,7 +392,6 @@ static struct console *exclusive_console;
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
static int preferred_console = -1;
-static bool has_preferred_console;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);
@@ -294,30 +406,22 @@ enum con_msg_format_flags {
static int console_msg_format = MSG_FORMAT_DEFAULT;
/*
- * The printk log buffer consists of a chain of concatenated variable
- * length records. Every record starts with a record header, containing
- * the overall length of the record.
- *
- * The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these entries are maintained when messages are
- * stored.
+ * The printk log buffer consists of a sequenced collection of records, each
+ * containing variable length message text. Every record also contains its
+ * own meta-data (@info).
*
- * If the heads indicate available messages, the length in the header
- * tells the start next message. A length == 0 for the next message
- * indicates a wrap-around to the beginning of the buffer.
+ * Every record meta-data carries the timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual kernel
+ * messages use LOG_KERN; userspace-injected messages always carry a matching
+ * syslog facility, by default LOG_USER. The origin of every message can be
+ * reliably determined that way.
*
- * Every record carries the monotonic timestamp in microseconds, as well as
- * the standard userspace syslog level and syslog facility. The usual
- * kernel messages use LOG_KERN; userspace-injected messages always carry
- * a matching syslog facility, by default LOG_USER. The origin of every
- * message can be reliably determined that way.
+ * The human readable log message of a record is available in @text, the
+ * length of the message text in @text_len. The stored message is not
+ * terminated.
*
- * The human readable log message directly follows the message header. The
- * length of the message text is stored in the header, the stored message
- * is not terminated.
- *
- * Optionally, a message can carry a dictionary of properties (key/value pairs),
- * to provide userspace with a machine-readable message context.
+ * Optionally, a record can carry a dictionary of properties (key/value
+ * pairs), to provide userspace with a machine-readable message context.
*
* Examples for well-defined, commonly used property names are:
* DEVICE=b12:8 device identifier
@@ -327,25 +431,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* +sound:card0 subsystem:devname
* SUBSYSTEM=pci driver-core subsystem name
*
- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
- * follows directly after a '=' character. Every property is terminated by
- * a '\0' character. The last property is not terminated.
- *
- * Example of a message structure:
- * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
- * 0008 34 00 record is 52 bytes long
- * 000a 0b 00 text is 11 bytes long
- * 000c 1f 00 dictionary is 23 bytes long
- * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
- * 0010 69 74 27 73 20 61 20 6c "it's a l"
- * 69 6e 65 "ine"
- * 001b 44 45 56 49 43 "DEVIC"
- * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
- * 52 49 56 45 52 3d 62 75 "RIVER=bu"
- * 67 "g"
- * 0032 00 00 00 padding to next message header
- *
- * The 'struct printk_log' buffer header must never be directly exported to
+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
+ * and values are terminated by a '\0' character.
+ *
+ * Example of record values:
+ * record.text_buf = "it's a line" (unterminated)
+ * record.info.seq = 56
+ * record.info.ts_nsec = 36863
+ * record.info.text_len = 11
+ * record.info.facility = 0 (LOG_KERN)
+ * record.info.flags = 0
+ * record.info.level = 3 (LOG_ERR)
+ * record.info.caller_id = 299 (task 299)
+ * record.info.dev_info.subsystem = "pci" (terminated)
+ * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated)
+ *
+ * The 'struct printk_info' buffer must never be directly exported to
* userspace, it is a kernel-private implementation detail that might
* need to be changed in the future, when the requirements change.
*
@@ -360,230 +461,138 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* non-prinatable characters are escaped in the "\xff" notation.
*/
-enum log_flags {
- LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_CONT = 8, /* text is a fragment of a continuation line */
-};
-
-struct printk_log {
- u64 ts_nsec; /* timestamp in nanoseconds */
- u16 len; /* length of entire record */
- u16 text_len; /* length of text buffer */
- u16 dict_len; /* length of dictionary buffer */
- u8 facility; /* syslog facility */
- u8 flags:5; /* internal record flags */
- u8 level:3; /* syslog level */
-#ifdef CONFIG_PRINTK_CALLER
- u32 caller_id; /* thread id or processor id */
-#endif
-}
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-__packed __aligned(4)
-#endif
-;
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
+static DEFINE_MUTEX(syslog_lock);
/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
+ * Specifies if a legacy console is registered. If legacy consoles are
+ * present, it is necessary to perform the console lock/unlock dance
+ * whenever console flushing should occur.
*/
-DEFINE_RAW_SPINLOCK(logbuf_lock);
+bool have_legacy_console;
/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
+ * Specifies if an nbcon console is registered. If nbcon consoles are present,
+ * synchronous printing of legacy consoles will not occur during panic until
+ * the backtrace has been stored to the ringbuffer.
*/
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
+bool have_nbcon_console;
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
+/*
+ * Specifies if a boot console is registered. If boot consoles are present,
+ * nbcon consoles cannot print simultaneously and must be synchronized by
+ * the console lock. This is because boot consoles and nbcon consoles may
+ * have mapped the same hardware.
+ */
+bool have_boot_console;
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
+/* See printk_legacy_allow_panic_sync() for details. */
+bool legacy_allow_panic_sync;
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+static DECLARE_WAIT_QUEUE_HEAD(legacy_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
static size_t syslog_partial;
static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
+/* True when _all_ printer threads are available for printing. */
+bool printk_kthreads_running;
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static u64 exclusive_console_stop_seq;
-
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
-static u32 clear_idx;
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
-#ifdef CONFIG_PRINTK_CALLER
-#define PREFIX_MAX 48
-#else
-#define PREFIX_MAX 32
-#endif
-#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @syslog_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#define LOG_ALIGN __alignof__(struct printk_log)
+#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-#define LOG_BUF_LEN_MAX (u32)(1 << 31)
+#define LOG_BUF_LEN_MAX ((u32)1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
/*
+ * Define the average message size. This only affects the number of
+ * descriptors that will be available. Underestimating is better than
+ * overestimating (too many available descriptors is better than not enough).
+ */
+#define PRB_AVGBITS 5 /* 32 character average length */
+
+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
+#error CONFIG_LOG_BUF_SHIFT value too small.
+#endif
+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
+ PRB_AVGBITS, &__log_buf[0]);
+
+static struct printk_ringbuffer printk_rb_dynamic;
+
+struct printk_ringbuffer *prb = &printk_rb_static;
+
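/*
 * Illustrative arithmetic only, not part of this patch, assuming
 * CONFIG_LOG_BUF_SHIFT=17 purely for the sake of the example:
 *
 *   text buffer:  1 << 17         = 131072 bytes
 *   descriptors:  1 << (17 - 5)   =   4096 records maximum
 *   average:      131072 / 4096   =     32 bytes of text per record
 *
 * Choosing a smaller PRB_AVGBITS reserves more descriptors for the same text
 * buffer, which only costs meta-data space; too few descriptors would limit
 * the number of storable records even while text space remains free.
 */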
+/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
*/
-static bool __printk_percpu_data_ready __read_mostly;
+static bool __printk_percpu_data_ready __ro_after_init;
bool printk_percpu_data_ready(void)
{
return __printk_percpu_data_ready;
}
-/* Return log buffer address */
-char *log_buf_addr_get(void)
+/* Must be called under syslog_lock. */
+static void latched_seq_write(struct latched_seq *ls, u64 val)
{
- return log_buf;
+ write_seqcount_latch_begin(&ls->latch);
+ ls->val[0] = val;
+ write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
+ write_seqcount_latch_end(&ls->latch);
}
-/* Return log buffer size */
-u32 log_buf_len_get(void)
+/* Can be called from any context. */
+static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
- return log_buf_len;
-}
-
-/* human readable text of the record */
-static char *log_text(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log);
-}
-
-/* optional key/value pair dictionary attached to the record */
-static char *log_dict(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log) + msg->text_len;
-}
-
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
-}
-
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
-}
-
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
-{
- u32 free;
-
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
+ do {
+ seq = read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
+ return val;
}
-static int log_make_free_space(u32 msg_size)
+/* Return log buffer address */
+char *log_buf_addr_get(void)
{
- while (log_first_seq < log_next_seq &&
- !logbuf_has_space(msg_size, false)) {
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- if (clear_seq < log_first_seq) {
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
- return 0;
-
- return -ENOMEM;
+ return log_buf;
}
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
+/* Return log buffer size */
+u32 log_buf_len_get(void)
{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
+ return log_buf_len;
}
/*
@@ -594,84 +603,23 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
/*
* The message should not take the whole buffer. Otherwise, it might
* get removed too soon.
*/
u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+
if (*text_len > max_text_len)
*text_len = max_text_len;
- /* enable the warning message */
- *trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
-}
-
-/* insert record into the buffer, discard old ones, update heads */
-static int log_store(u32 caller_id, int facility, int level,
- enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
- const char *text, u16 text_len)
-{
- struct printk_log *msg;
- u32 size, pad_len;
- u16 trunc_msg_len = 0;
-
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
-
- if (log_make_free_space(size)) {
- /* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
- /* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
- return 0;
- }
-
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
- /*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
- */
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
- }
- /* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
- memcpy(log_text(msg), text, text_len);
- msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
- memcpy(log_dict(msg), dict, dict_len);
- msg->dict_len = dict_len;
- msg->facility = facility;
- msg->level = level & 7;
- msg->flags = flags & 0x1f;
- if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
+ /* enable the warning message (if there is room) */
+ *trunc_msg_len = strlen(trunc_msg);
+ if (*text_len >= *trunc_msg_len)
+ *text_len -= *trunc_msg_len;
else
- msg->ts_nsec = local_clock();
-#ifdef CONFIG_PRINTK_CALLER
- msg->caller_id = caller_id;
-#endif
- memset(log_dict(msg) + dict_len, 0, pad_len);
- msg->len = size;
-
- /* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
-
- return msg->text_len;
+ *trunc_msg_len = 0;
}
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
@@ -700,17 +648,6 @@ static int check_syslog_permissions(int type, int source)
if (syslog_action_restricted(type)) {
if (capable(CAP_SYSLOG))
goto ok;
- /*
- * For historical reasons, accept CAP_SYS_ADMIN too, with
- * a warning.
- */
- if (capable(CAP_SYS_ADMIN)) {
- pr_warn_once("%s (%d): Attempt to access syslog with "
- "CAP_SYS_ADMIN but no CAP_SYSLOG "
- "(deprecated).\n",
- current->comm, task_pid_nr(current));
- goto ok;
- }
return -EPERM;
}
ok:
@@ -723,13 +660,13 @@ static void append_char(char **pp, char *e, char c)
*(*pp)++ = c;
}
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq)
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
{
- u64 ts_usec = msg->ts_nsec;
+ u64 ts_usec = info->ts_nsec;
char caller[20];
#ifdef CONFIG_PRINTK_CALLER
- u32 id = msg->caller_id;
+ u32 id = info->caller_id;
snprintf(caller, sizeof(caller), ",caller=%c%u",
id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
@@ -740,13 +677,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
do_div(ts_usec, 1000);
return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
- (msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-', caller);
+ (info->facility << 3) | info->level, info->seq,
+ ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len)
+static ssize_t msg_add_ext_text(char *buf, size_t size,
+ const char *text, size_t text_len,
+ unsigned char endc)
{
char *p = buf, *e = buf + size;
size_t i;
@@ -760,45 +697,52 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
else
append_char(&p, e, c);
}
- append_char(&p, e, '\n');
+ append_char(&p, e, endc);
- if (dict_len) {
- bool line = true;
+ return p - buf;
+}
- for (i = 0; i < dict_len; i++) {
- unsigned char c = dict[i];
+static ssize_t msg_add_dict_text(char *buf, size_t size,
+ const char *key, const char *val)
+{
+ size_t val_len = strlen(val);
+ ssize_t len;
- if (line) {
- append_char(&p, e, ' ');
- line = false;
- }
+ if (!val_len)
+ return 0;
- if (c == '\0') {
- append_char(&p, e, '\n');
- line = true;
- continue;
- }
+ len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */
+ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
+ len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
- if (c < ' ' || c >= 127 || c == '\\') {
- p += scnprintf(p, e - p, "\\x%02x", c);
- continue;
- }
+ return len;
+}
- append_char(&p, e, c);
- }
- append_char(&p, e, '\n');
- }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info)
+{
+ ssize_t len;
- return p - buf;
+ len = msg_add_ext_text(buf, size, text, text_len, '\n');
+
+ if (!dev_info)
+ goto out;
+
+ len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
+ dev_info->subsystem);
+ len += msg_add_dict_text(buf + len, size - len, "DEVICE",
+ dev_info->device);
+out:
+ return len;
}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
- u32 idx;
+ atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
- char buf[CONSOLE_EXT_LOG_MAX];
+ struct printk_buffers pbufs;
};
static __printf(3, 4) __cold
@@ -808,7 +752,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...)
int r;
va_start(args, fmt);
- r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+ r = vprintk_emit(facility, level, NULL, fmt, args);
va_end(args);
return r;
@@ -824,7 +768,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > LOG_LINE_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -867,7 +811,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
if (LOG_FACILITY(u) != 0)
facility = LOG_FACILITY(u);
endp++;
- len -= endp - line;
line = endp;
}
}
@@ -881,84 +824,83 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_log *msg;
- size_t len;
+ char *outbuf = &user->pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &user->pbufs,
+ };
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- logbuf_lock_irq();
- while (user->seq == log_next_seq) {
+ if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- logbuf_unlock_irq();
goto out;
}
- logbuf_unlock_irq();
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
+ printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
+ false)); /* LMM(devkmsg_read:A) */
if (ret)
goto out;
- logbuf_lock_irq();
}
- if (user->seq < log_first_seq) {
+ if (pmsg.dropped) {
/* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ atomic64_set(&user->seq, pmsg.seq);
ret = -EPIPE;
- logbuf_unlock_irq();
goto out;
}
- msg = log_from_idx(user->idx);
- len = msg_print_ext_header(user->buf, sizeof(user->buf),
- msg, user->seq);
- len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
+ atomic64_set(&user->seq, pmsg.seq + 1);
- user->idx = log_next(user->idx);
- user->seq++;
- logbuf_unlock_irq();
-
- if (len > count) {
+ if (pmsg.outbuf_len > count) {
ret = -EINVAL;
goto out;
}
- if (copy_to_user(buf, user->buf, len)) {
+ if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
ret = -EFAULT;
goto out;
}
- ret = len;
+ ret = pmsg.outbuf_len;
out:
mutex_unlock(&user->lock);
return ret;
}
+/*
+ * Be careful when modifying this function!!!
+ *
+ * Only a few operations are supported because the device works only with
+ * entire variable-length messages (records). Non-standard values are
+ * returned in the other cases, and it has been this way for quite some time.
+ * User space applications might depend on this behavior.
+ */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -966,40 +908,33 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
- user->seq = clear_seq;
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
return ret;
}
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
+ struct printk_info info;
__poll_t ret = 0;
- if (!user)
- return EPOLLERR|EPOLLNVAL;
-
poll_wait(file, &log_wait, wait);
- logbuf_lock_irq();
- if (user->seq < log_next_seq) {
+ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
+ if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
}
- logbuf_unlock_irq();
return ret;
}
@@ -1020,7 +955,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
return err;
}
- user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
+ user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL);
if (!user)
return -ENOMEM;
@@ -1029,10 +964,7 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- logbuf_lock_irq();
- user->idx = log_first_idx;
- user->seq = log_first_seq;
- logbuf_unlock_irq();
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
file->private_data = user;
return 0;
@@ -1042,13 +974,10 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
-
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
- kfree(user);
+ kvfree(user);
return 0;
}
@@ -1061,7 +990,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_CRASH_CORE
+#ifdef CONFIG_VMCORE_INFO
/*
* This appends the listed symbols to /proc/vmcore
*
@@ -1072,23 +1001,61 @@ const struct file_operations kmsg_fops = {
*/
void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(clear_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
+ struct dev_printk_info *dev_info = NULL;
+
+ VMCOREINFO_SYMBOL(prb);
+ VMCOREINFO_SYMBOL(printk_rb_static);
+ VMCOREINFO_SYMBOL(clear_seq);
+
/*
- * Export struct printk_log size and field offsets. User space tools can
+ * Export struct size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
*/
- VMCOREINFO_STRUCT_SIZE(printk_log);
- VMCOREINFO_OFFSET(printk_log, ts_nsec);
- VMCOREINFO_OFFSET(printk_log, len);
- VMCOREINFO_OFFSET(printk_log, text_len);
- VMCOREINFO_OFFSET(printk_log, dict_len);
-#ifdef CONFIG_PRINTK_CALLER
- VMCOREINFO_OFFSET(printk_log, caller_id);
-#endif
+
+ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
+ VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, fail);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
+ VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
+ VMCOREINFO_OFFSET(prb_desc_ring, descs);
+ VMCOREINFO_OFFSET(prb_desc_ring, infos);
+ VMCOREINFO_OFFSET(prb_desc_ring, head_id);
+ VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc);
+ VMCOREINFO_OFFSET(prb_desc, state_var);
+ VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
+
+ VMCOREINFO_STRUCT_SIZE(printk_info);
+ VMCOREINFO_OFFSET(printk_info, seq);
+ VMCOREINFO_OFFSET(printk_info, ts_nsec);
+ VMCOREINFO_OFFSET(printk_info, text_len);
+ VMCOREINFO_OFFSET(printk_info, caller_id);
+ VMCOREINFO_OFFSET(printk_info, dev_info);
+
+ VMCOREINFO_STRUCT_SIZE(dev_printk_info);
+ VMCOREINFO_OFFSET(dev_printk_info, subsystem);
+ VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
+ VMCOREINFO_OFFSET(dev_printk_info, device);
+ VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_ring);
+ VMCOREINFO_OFFSET(prb_data_ring, size_bits);
+ VMCOREINFO_OFFSET(prb_data_ring, data);
+ VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
+ VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
+
+ VMCOREINFO_SIZE(atomic_long_t);
+ VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
@@ -1160,17 +1127,61 @@ static inline void log_buf_add_cpu(void) {}
static void __init set_percpu_data_ready(void)
{
- printk_safe_init();
- /* Make sure we set this flag only after printk_safe() init is done */
- barrier();
__printk_percpu_data_ready = true;
}
+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_reserved_entry e;
+ struct printk_record dest_r;
+
+ prb_rec_init_wr(&dest_r, r->info->text_len);
+
+ if (!prb_reserve(&e, rb, &dest_r))
+ return 0;
+
+ memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
+ dest_r.info->text_len = r->info->text_len;
+ dest_r.info->facility = r->info->facility;
+ dest_r.info->level = r->info->level;
+ dest_r.info->flags = r->info->flags;
+ dest_r.info->ts_nsec = r->info->ts_nsec;
+ dest_r.info->caller_id = r->info->caller_id;
+ memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
+
+ prb_final_commit(&e);
+
+ return prb_record_text_space(&e);
+}
+
+static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;
+
+static void print_log_buf_usage_stats(void)
+{
+ unsigned int descs_count = log_buf_len >> PRB_AVGBITS;
+ size_t meta_data_size;
+
+ meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info));
+
+ pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n",
+ log_buf_len, meta_data_size, log_buf_len + meta_data_size);
+}
+
void __init setup_log_buf(int early)
{
+ struct printk_info *new_infos;
+ unsigned int new_descs_count;
+ struct prb_desc *new_descs;
+ struct printk_info info;
+ struct printk_record r;
+ unsigned int text_size;
+ size_t new_descs_size;
+ size_t new_infos_size;
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ u64 seq;
/*
* Some archs call setup_log_buf() multiple times - first is very
@@ -1186,27 +1197,98 @@ void __init setup_log_buf(int early)
if (!early && !new_log_buf_len)
log_buf_add_cpu();
- if (!new_log_buf_len)
+ if (!new_log_buf_len) {
+ /* Show the memory stats only once. */
+ if (!early)
+ goto out;
+
return;
+ }
+
+ new_descs_count = new_log_buf_len >> PRB_AVGBITS;
+ if (new_descs_count == 0) {
+ pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
+ goto out;
+ }
new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
- pr_err("log_buf_len: %lu bytes not available\n",
- new_log_buf_len);
- return;
+ pr_err("log_buf_len: %lu text bytes not available\n",
+ new_log_buf_len);
+ goto out;
+ }
+
+ new_descs_size = new_descs_count * sizeof(struct prb_desc);
+ new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
+ if (unlikely(!new_descs)) {
+ pr_err("log_buf_len: %zu desc bytes not available\n",
+ new_descs_size);
+ goto err_free_log_buf;
+ }
+
+ new_infos_size = new_descs_count * sizeof(struct printk_info);
+ new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
+ if (unlikely(!new_infos)) {
+ pr_err("log_buf_len: %zu info bytes not available\n",
+ new_infos_size);
+ goto err_free_descs;
}
- logbuf_lock_irqsave(flags);
+ prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
+
+ prb_init(&printk_rb_dynamic,
+ new_log_buf, ilog2(new_log_buf_len),
+ new_descs, ilog2(new_descs_count),
+ new_infos);
+
+ local_irq_save(flags);
+
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_next_idx;
- memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
- logbuf_unlock_irqrestore(flags);
- pr_info("log_buf_len: %u bytes\n", log_buf_len);
+ free = __LOG_BUF_LEN;
+ prb_for_each_record(0, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
+
+ prb = &printk_rb_dynamic;
+
+ local_irq_restore(flags);
+
+ /*
+ * Copy any remaining messages that might have appeared from
+ * NMI context after copying but before switching to the
+ * dynamic buffer.
+ */
+ prb_for_each_record(seq, &printk_rb_static, seq, &r) {
+ text_size = add_to_rb(&printk_rb_dynamic, &r);
+ if (text_size > free)
+ free = 0;
+ else
+ free -= text_size;
+ }
+
+ if (seq != prb_next_seq(&printk_rb_static)) {
+ pr_err("dropped %llu messages\n",
+ prb_next_seq(&printk_rb_static) - seq);
+ }
+
+ print_log_buf_usage_stats();
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+ return;
+
+err_free_descs:
+ memblock_free(new_descs, new_descs_size);
+err_free_log_buf:
+ memblock_free(new_log_buf, new_log_buf_len);
+out:
+ print_log_buf_usage_stats();
}
static bool __read_mostly ignore_loglevel;
@@ -1256,11 +1338,11 @@ static void boot_delay_msec(int level)
{
unsigned long long k;
unsigned long timeout;
+ bool suppress = !is_printk_force_console() &&
+ suppress_message_printing(level);
- if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
- || suppress_message_printing(level)) {
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress)
return;
- }
k = (unsigned long long)loops_per_msec * boot_delay;
@@ -1313,18 +1395,18 @@ static size_t print_caller(u32 id, char *buf)
#define print_caller(id, buf) 0
#endif
-static size_t print_prefix(const struct printk_log *msg, bool syslog,
- bool time, char *buf)
+static size_t info_print_prefix(const struct printk_info *info, bool syslog,
+ bool time, char *buf)
{
size_t len = 0;
if (syslog)
- len = print_syslog((msg->facility << 3) | msg->level, buf);
+ len = print_syslog((info->facility << 3) | info->level, buf);
if (time)
- len += print_time(msg->ts_nsec, buf + len);
+ len += print_time(info->ts_nsec, buf + len);
- len += print_caller(msg->caller_id, buf + len);
+ len += print_caller(info->caller_id, buf + len);
if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
buf[len++] = ' ';
@@ -1334,71 +1416,244 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size)
+/*
+ * Prepare the record for printing. The text is shifted within the given
+ * buffer to avoid a need for another one. The following operations are
+ * done:
+ *
+ * - Add prefix for each line.
+ * - Drop truncated lines that no longer fit into the buffer.
+ * - Add the trailing newline that has been removed in vprintk_store().
+ * - Add a string terminator.
+ *
+ * Since the produced string is always terminated, the maximum possible
+ * return value is @r->text_buf_size - 1.
+ *
+ * Return: The length of the updated/prepared text, including the added
+ * prefixes and the newline. The terminator is not counted. The dropped
+ * line(s) are not counted.
+ */
+static size_t record_print_text(struct printk_record *r, bool syslog,
+ bool time)
{
- const char *text = log_text(msg);
- size_t text_size = msg->text_len;
+ size_t text_len = r->info->text_len;
+ size_t buf_size = r->text_buf_size;
+ char *text = r->text_buf;
+ char prefix[PRINTK_PREFIX_MAX];
+ bool truncated = false;
+ size_t prefix_len;
+ size_t line_len;
size_t len = 0;
- char prefix[PREFIX_MAX];
- const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
+ char *next;
- do {
- const char *next = memchr(text, '\n', text_size);
- size_t text_len;
+ /*
+ * If the message was truncated because the buffer was not large
+ * enough, treat the available text as if it were the full text.
+ */
+ if (text_len > buf_size)
+ text_len = buf_size;
+ prefix_len = info_print_prefix(r->info, syslog, time, prefix);
+
+ /*
+ * @text_len: bytes of unprocessed text
+ * @line_len: bytes of current line _without_ newline
+ * @text: pointer to beginning of current line
+ * @len: number of bytes prepared in r->text_buf
+ */
+ for (;;) {
+ next = memchr(text, '\n', text_len);
if (next) {
- text_len = next - text;
- next++;
- text_size -= next - text;
+ line_len = next - text;
} else {
- text_len = text_size;
+ /* Drop truncated line(s). */
+ if (truncated)
+ break;
+ line_len = text_len;
}
- if (buf) {
- if (prefix_len + text_len + 1 >= size - len)
+ /*
+ * Truncate the text if there is not enough space to add the
+ * prefix and a trailing newline and a terminator.
+ */
+ if (len + prefix_len + text_len + 1 + 1 > buf_size) {
+ /* Drop even the current line if no space. */
+ if (len + prefix_len + line_len + 1 + 1 > buf_size)
break;
- memcpy(buf + len, prefix, prefix_len);
- len += prefix_len;
- memcpy(buf + len, text, text_len);
- len += text_len;
- buf[len++] = '\n';
- } else {
- /* SYSLOG_ACTION_* buffer size only calculation */
- len += prefix_len + text_len + 1;
+ text_len = buf_size - len - prefix_len - 1 - 1;
+ truncated = true;
}
- text = next;
- } while (text);
+ memmove(text + prefix_len, text, text_len);
+ memcpy(text, prefix, prefix_len);
+
+ /*
+ * Increment the prepared length to include the text and
+ * prefix that were just moved+copied. Also increment for the
+ * newline at the end of this line. If this is the last line,
+ * there is no newline, but it will be added immediately below.
+ */
+ len += prefix_len + line_len + 1;
+ if (text_len == line_len) {
+ /*
+ * This is the last line. Add the trailing newline
+ * removed in vprintk_store().
+ */
+ text[prefix_len + line_len] = '\n';
+ break;
+ }
+
+ /*
+ * Advance beyond the added prefix and the related line with
+ * its newline.
+ */
+ text += prefix_len + line_len + 1;
+
+ /*
+ * The remaining text has only decreased by the line with its
+ * newline.
+ *
+ * Note that @text_len can become zero. It happens when @text
+ * ended with a newline (either due to truncation or the
+ * original string ending with "\n\n"). The loop is correctly
+ * repeated and (if not truncated) an empty line with a prefix
+ * will be prepared.
+ */
+ text_len -= line_len + 1;
+ }
+
+ /*
+ * If a buffer was provided, it will be terminated. Space for the
+ * string terminator is guaranteed to be available. The terminator is
+ * not counted in the return value.
+ */
+ if (buf_size > 0)
+ r->text_buf[len] = 0;
return len;
}
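
The loop above is easier to see in isolation. The following is a minimal userspace sketch of the same in-place idea (shift each line right, copy the prefix in front, re-add the trailing newline); it deliberately omits the truncation handling and assumes the buffer is large enough:

#include <stdio.h>
#include <string.h>

static size_t prefix_lines(char *buf, size_t text_len, const char *prefix)
{
	size_t prefix_len = strlen(prefix);
	char *text = buf;
	size_t len = 0;

	for (;;) {
		char *next = memchr(text, '\n', text_len);
		size_t line_len = next ? (size_t)(next - text) : text_len;

		/* Shift the remaining text right and put the prefix in front. */
		memmove(text + prefix_len, text, text_len);
		memcpy(text, prefix, prefix_len);

		len += prefix_len + line_len + 1;
		if (line_len == text_len) {
			/* Last line: re-add the stripped trailing newline. */
			text[prefix_len + line_len] = '\n';
			break;
		}

		/* Advance past the prefixed line and its newline. */
		text += prefix_len + line_len + 1;
		text_len -= line_len + 1;
	}

	buf[len] = '\0';
	return len;
}

int main(void)
{
	char buf[128] = "first\nsecond";

	prefix_lines(buf, strlen(buf), "<6>");
	fputs(buf, stdout);	/* prints "<6>first\n<6>second\n" */
	return 0;
}
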
+static size_t get_record_print_text_size(struct printk_info *info,
+ unsigned int line_count,
+ bool syslog, bool time)
+{
+ char prefix[PRINTK_PREFIX_MAX];
+ size_t prefix_len;
+
+ prefix_len = info_print_prefix(info, syslog, time, prefix);
+
+ /*
+ * Each line will be preceded with a prefix. The intermediate
+ * newlines are already within the text, but a final trailing
+ * newline will be added.
+ */
+ return ((prefix_len * line_count) + info->text_len + 1);
+}
+
+/*
+ * Beginning with @start_seq, find the first record where it and all following
+ * records up to (but not including) @max_seq fit into @size.
+ *
+ * @max_seq is simply an upper bound and does not need to exist. If the caller
+ * does not require an upper bound, -1 can be used for @max_seq.
+ */
+static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
+ bool syslog, bool time)
+{
+ struct printk_info info;
+ unsigned int line_count;
+ size_t len = 0;
+ u64 seq;
+
+ /* Determine the size of the records up to @max_seq. */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= max_seq)
+ break;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ /*
+ * Adjust the upper bound for the next loop to avoid subtracting
+ * lengths that were never added.
+ */
+ if (seq < max_seq)
+ max_seq = seq;
+
+ /*
+ * Move the first record forward until the length fits into the buffer.
+ * Ignore the newest messages that were not counted in the above loop.
+ * Messages might appear and get lost in the meantime. This is a best
+ * effort that prevents an infinite loop that could occur with a retry.
+ */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (len <= size || info.seq >= max_seq)
+ break;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ return seq;
+}
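
The same two-pass idea, reduced to a plain array of record lengths (a standalone sketch, not kernel code): first sum everything from the starting point, then drop the oldest entries until the remainder fits.

#include <stdio.h>

static size_t first_fitting_idx(const size_t *lens, size_t nr, size_t size)
{
	size_t len = 0, i;

	/* Pass 1: total size of all records from the start. */
	for (i = 0; i < nr; i++)
		len += lens[i];

	/* Pass 2: move the start forward until the rest fits. */
	for (i = 0; i < nr && len > size; i++)
		len -= lens[i];

	return i;	/* index of the first record that fits */
}

int main(void)
{
	size_t lens[] = { 40, 10, 30, 20 };

	/* Only the last two records (30 + 20 = 50) fit into 55 bytes. */
	printf("start at record %zu\n", first_fitting_idx(lens, 4, 55));
	return 0;
}
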
+
+/* The caller is responsible for making sure @size is greater than 0. */
static int syslog_print(char __user *buf, int size)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
- struct printk_log *msg;
int len = 0;
+ u64 seq;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- while (size > 0) {
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
+
+ mutex_lock(&syslog_lock);
+
+ /*
+ * Wait for the @syslog_seq record to be available. @syslog_seq may
+ * change while waiting.
+ */
+ do {
+ seq = syslog_seq;
+
+ mutex_unlock(&syslog_lock);
+ /*
+ * Guarantee this task is visible on the waitqueue before
+ * checking the wake condition.
+ *
+ * The full memory barrier within set_current_state() of
+ * prepare_to_wait_event() pairs with the full memory barrier
+ * within wq_has_sleeper().
+ *
+ * This pairs with __wake_up_klogd:A.
+ */
+ len = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
+ mutex_lock(&syslog_lock);
+
+ if (len)
+ goto out;
+ } while (syslog_seq != seq);
+
+ /*
+ * Copy records that fit into the buffer. The above loop makes sure
+ * that the first record is always available.
+ */
+ do {
size_t n;
size_t skip;
+ int err;
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
- }
- if (syslog_seq == log_next_seq) {
- logbuf_unlock_irq();
+ if (!prb_read_valid(prb, syslog_seq, &r))
break;
+
+ if (r.info->seq != syslog_seq) {
+ /* message is gone, move to next valid one */
+ syslog_seq = r.info->seq;
+ syslog_partial = 0;
}
/*
@@ -1409,13 +1664,10 @@ static int syslog_print(char __user *buf, int size)
syslog_time = printk_time;
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, true, syslog_time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = record_print_text(&r, true, syslog_time);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
+ syslog_seq = r.info->seq + 1;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1424,12 +1676,15 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
if (!n)
break;
- if (copy_to_user(buf, text + skip, n)) {
+ mutex_unlock(&syslog_lock);
+ err = copy_to_user(buf, text + skip, n);
+ mutex_lock(&syslog_lock);
+
+ if (err) {
if (!len)
len = -EFAULT;
break;
@@ -1438,83 +1693,60 @@ static int syslog_print(char __user *buf, int size)
len += n;
size -= n;
buf += n;
- }
-
+ } while (size);
+out:
+ mutex_unlock(&syslog_lock);
kfree(text);
return len;
}
static int syslog_print_all(char __user *buf, int size, bool clear)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
int len = 0;
- u64 next_seq;
u64 seq;
- u32 idx;
bool time;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
time = printk_time;
- logbuf_lock_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
-
- /* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
+ size, true, time);
- len -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ prb_for_each_record(seq, prb, seq, &r) {
+ int textlen;
- len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen = msg_print_text(msg, true, time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ textlen = record_print_text(&r, true, time);
- idx = log_next(idx);
- seq++;
+ if (len + textlen > size) {
+ seq--;
+ break;
+ }
- logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
+ if (len < 0)
+ break;
}
if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ mutex_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, seq);
+ mutex_unlock(&syslog_lock);
}
- logbuf_unlock_irq();
kfree(text);
return len;
@@ -1522,14 +1754,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
- logbuf_lock_irq();
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- logbuf_unlock_irq();
+ mutex_lock(&syslog_lock);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
+ mutex_unlock(&syslog_lock);
}
int do_syslog(int type, char __user *buf, int len, int source)
{
+ struct printk_info info;
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
@@ -1550,16 +1782,12 @@ int do_syslog(int type, char __user *buf, int len, int source)
return 0;
if (!access_ok(buf, len))
return -EFAULT;
- error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
- if (error)
- return error;
error = syslog_print(buf, len);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
clear = true;
- /* FALL THRU */
+ fallthrough;
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
if (!buf || len < 0)
@@ -1599,11 +1827,15 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
+ mutex_lock(&syslog_lock);
+ if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
+ /* No unread messages. */
+ mutex_unlock(&syslog_lock);
+ return 0;
+ }
+ if (info.seq != syslog_seq) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
+ syslog_seq = info.seq;
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1612,24 +1844,21 @@ int do_syslog(int type, char __user *buf, int len, int source)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = prb_next_seq(prb) - syslog_seq;
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
bool time = syslog_partial ? syslog_time : printk_time;
+ unsigned int line_count;
+ u64 seq;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, true, time, NULL,
- 0);
+ prb_for_each_info(syslog_seq, prb, seq, &info,
+ &line_count) {
+ error += get_record_print_text_size(&info, line_count,
+ true, time);
time = printk_time;
- idx = log_next(idx);
- seq++;
}
error -= syslog_partial;
}
- logbuf_unlock_irq();
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1672,12 +1901,25 @@ static bool console_waiter;
* there may be a waiter spinning (like a spinlock). Also it must be
* ready to hand over the lock at the end of the section.
*/
-static void console_lock_spinning_enable(void)
+void console_lock_spinning_enable(void)
{
+ /*
+ * Do not use spinning in panic(). The panic CPU wants to keep the lock.
+ * Non-panic CPUs abandon the flush anyway.
+ *
+ * Just keep the lockdep annotation. The panic-CPU should avoid
+ * taking console_owner_lock because it might cause a deadlock.
+ * This looks like the easiest way to prevent false lockdep
+ * reports without handling the races in a lockless way.
+ */
+ if (panic_in_progress())
+ goto lockdep;
+
raw_spin_lock(&console_owner_lock);
console_owner = current;
raw_spin_unlock(&console_owner_lock);
+lockdep:
/* The waiter may spin on us after setting console_owner */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}
@@ -1685,22 +1927,39 @@ static void console_lock_spinning_enable(void)
/**
* console_lock_spinning_disable_and_check - mark end of code where another
* thread was able to busy wait and check if there is a waiter
+ * @cookie: cookie returned from console_srcu_read_lock()
*
* This is called at the end of the section where spinning is allowed.
* It has two functions. First, it is a signal that it is no longer
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int console_lock_spinning_disable_and_check(void)
+int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
+ /*
+ * Ignore spinning waiters during panic() because they might get stopped
+ * or blocked at any time.
+ *
+ * It is safe because nobody is allowed to start spinning during panic
+ * in the first place. If there has been a waiter, then non-panic CPUs
+ * might keep spinning; they would get stopped anyway. The panic context
+ * will never start spinning, and an interrupted spin on the panic CPU
+ * will never continue.
+ */
+ if (panic_in_progress()) {
+ /* Keep lockdep happy. */
+ spin_release(&console_owner_dep_map, _THIS_IP_);
+ return 0;
+ }
+
raw_spin_lock(&console_owner_lock);
waiter = READ_ONCE(console_waiter);
console_owner = NULL;
@@ -1717,6 +1976,12 @@ static int console_lock_spinning_disable_and_check(void)
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
+ * Preserve lockdep lock ordering. Release the SRCU read lock before
+ * releasing the console_lock.
+ */
+ console_srcu_read_unlock(cookie);
+
+ /*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
@@ -1744,6 +2009,16 @@ static int console_trylock_spinning(void)
if (console_trylock())
return 1;
+ /*
+ * It's unsafe to spin once a panic has begun. If we are the
+ * panic CPU, we may have already halted the owner of the
+ * console_sem. If we are not the panic CPU, then we should
+ * avoid taking console_sem, so the panic CPU has a better
+ * chance of cleanly acquiring it later.
+ */
+ if (panic_in_progress())
+ return 0;
+
printk_safe_enter_irqsave(flags);
raw_spin_lock(&console_owner_lock);
@@ -1785,42 +2060,91 @@ static int console_trylock_spinning(void)
*/
mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
+ /*
+ * Update @console_may_schedule for trylock because the previous
+ * owner may have been schedulable.
+ */
+ console_may_schedule = 0;
+
return 1;
}
/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
+ * Recursion is tracked separately on each CPU. If NMIs are supported, an
+ * additional NMI context per CPU is also separately tracked. Until the
+ * per-CPU data is available, a separate "early tracking" counter is used.
*/
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
-{
- struct console *con;
+static DEFINE_PER_CPU(u8, printk_count);
+static u8 printk_count_early;
+#ifdef CONFIG_HAVE_NMI
+static DEFINE_PER_CPU(u8, printk_count_nmi);
+static u8 printk_count_nmi_early;
+#endif
- trace_console_rcuidle(text, len);
+/*
+ * Recursion is limited to keep the output sane. printk() should not require
+ * more than 1 level of recursion (allowing, for example, printk() to trigger
+ * a WARN), but a higher value is used in case some printk-internal errors
+ * exist, such as the ringbuffer validation checks failing.
+ */
+#define PRINTK_MAX_RECURSION 3
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- if (con->flags & CON_EXTENDED)
- con->write(con, ext_text, ext_len);
- else
- con->write(con, text, len);
+/*
+ * Return a pointer to the dedicated counter for the CPU+context of the
+ * caller.
+ */
+static u8 *__printk_recursion_counter(void)
+{
+#ifdef CONFIG_HAVE_NMI
+ if (in_nmi()) {
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count_nmi);
+ return &printk_count_nmi_early;
}
+#endif
+ if (printk_percpu_data_ready())
+ return this_cpu_ptr(&printk_count);
+ return &printk_count_early;
}
+/*
+ * Enter recursion tracking. Interrupts are disabled to simplify tracking.
+ * The caller must check the boolean return value to see if the recursion is
+ * allowed. On failure, interrupts are not disabled.
+ *
+ * @recursion_ptr must be a variable of type (u8 *) and is the same variable
+ * that is passed to printk_exit_irqrestore().
+ */
+#define printk_enter_irqsave(recursion_ptr, flags) \
+({ \
+ bool success = true; \
+ \
+ typecheck(u8 *, recursion_ptr); \
+ local_irq_save(flags); \
+ (recursion_ptr) = __printk_recursion_counter(); \
+ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \
+ local_irq_restore(flags); \
+ success = false; \
+ } else { \
+ (*(recursion_ptr))++; \
+ } \
+ success; \
+})
+
+/* Exit recursion tracking, restoring interrupts. */
+#define printk_exit_irqrestore(recursion_ptr, flags) \
+ do { \
+ typecheck(u8 *, recursion_ptr); \
+ (*(recursion_ptr))--; \
+ local_irq_restore(flags); \
+ } while (0)
+
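
Stripped of the per-CPU and NMI handling, the guard above reduces to a bounded counter around the logging path. A simplified, single-threaded sketch (no IRQ disabling; the function names are hypothetical):

#include <stdio.h>

#define MAX_RECURSION	3

static unsigned char recursion_count;

static int my_log(const char *msg);

static void helper_that_may_log(void)
{
	my_log("helper called");	/* would recurse forever without the guard */
}

static int my_log(const char *msg)
{
	if (recursion_count > MAX_RECURSION)
		return 0;		/* too deep: drop the message */
	recursion_count++;

	printf("%s\n", msg);
	helper_that_may_log();		/* recursion is bounded by the counter */

	recursion_count--;
	return 1;
}

int main(void)
{
	my_log("hello");
	return 0;
}
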
int printk_delay_msec __read_mostly;
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
{
+ boot_delay_msec(level);
+
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
@@ -1834,277 +2158,324 @@ static inline void printk_delay(void)
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
- 0x80000000 + raw_smp_processor_id();
+ 0x80000000 + smp_processor_id();
}
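
Because the top bit is set only in the non-task case, consumers of the caller id can later tell a PID apart from a CPU number. A small standalone sketch of the encode/decode round trip (the "T"/"C" output style is used here purely for illustration):

#include <stdio.h>

static unsigned int encode_caller(int in_task, unsigned int pid, unsigned int cpu)
{
	return in_task ? pid : 0x80000000u + cpu;
}

static void show_caller(unsigned int id)
{
	if (id & 0x80000000u)
		printf("C%u\n", id & ~0x80000000u);	/* other context: CPU number */
	else
		printf("T%u\n", id);			/* task context: PID */
}

int main(void)
{
	show_caller(encode_caller(1, 1234, 0));	/* T1234 */
	show_caller(encode_caller(0, 0, 3));	/* C3 */
	return 0;
}
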
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
- char buf[LOG_LINE_MAX];
- size_t len; /* length == 0 means unused buffer */
- u32 caller_id; /* printk_caller_id() of first print */
- u64 ts_nsec; /* time of first print */
- u8 level; /* log level of first message */
- u8 facility; /* log facility of first message */
- enum log_flags flags; /* prefix, newline flags */
-} cont;
-
-static void cont_flush(void)
-{
- if (cont.len == 0)
- return;
-
- log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.len = 0;
-}
-
-static bool cont_add(u32 caller_id, int facility, int level,
- enum log_flags flags, const char *text, size_t len)
+/**
+ * printk_parse_prefix - Parse level and control flags.
+ *
+ * @text: The terminated text message.
+ * @level: A pointer to the current level value, will be updated.
+ * @flags: A pointer to the current printk_info flags, will be updated.
+ *
+ * @level may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @level must be set to
+ * LOGLEVEL_DEFAULT in order to be updated with the parsed value.
+ *
+ * @flags may be NULL if the caller is not interested in the parsed value.
+ * Otherwise the variable pointed to by @flags will be OR'd with the parsed
+ * value.
+ *
+ * Return: The length of the parsed level and control flags.
+ */
+u16 printk_parse_prefix(const char *text, int *level,
+ enum printk_info_flags *flags)
{
- /* If the line gets too long, split it up in separate records. */
- if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
- return false;
- }
+ u16 prefix_len = 0;
+ int kern_level;
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.caller_id = caller_id;
- cont.ts_nsec = local_clock();
- cont.flags = flags;
- }
+ while (*text) {
+ kern_level = printk_get_level(text);
+ if (!kern_level)
+ break;
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
+ switch (kern_level) {
+ case '0' ... '7':
+ if (level && *level == LOGLEVEL_DEFAULT)
+ *level = kern_level - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ if (flags)
+ *flags |= LOG_CONT;
+ }
- // The original flags come from the first line,
- // but later continuations can add a newline.
- if (flags & LOG_NEWLINE) {
- cont.flags |= LOG_NEWLINE;
- cont_flush();
+ prefix_len += 2;
+ text += 2;
}
- return true;
+ return prefix_len;
}
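
A userspace sketch of the parsing loop above: in the kernel the prefix is a two-byte sequence (the SOH byte '\001' followed by a level digit or 'c' for KERN_CONT). The helper below stands in for printk_get_level(), and the LOGLEVEL_DEFAULT check is simplified to "still negative":

#include <stdio.h>

#define SOH_CHAR	'\001'

/* Simplified stand-in for printk_get_level(): return the level character
 * of a "\001X" prefix, or 0 if there is none. */
static char get_level(const char *text)
{
	if (text[0] == SOH_CHAR && text[1])
		return text[1];
	return 0;
}

static unsigned short parse_prefix(const char *text, int *level, int *cont)
{
	unsigned short prefix_len = 0;
	char kern_level;

	while (*text) {
		kern_level = get_level(text);
		if (!kern_level)
			break;

		if (kern_level >= '0' && kern_level <= '7') {
			if (level && *level < 0)
				*level = kern_level - '0';
		} else if (kern_level == 'c' && cont) {
			*cont = 1;
		}

		prefix_len += 2;
		text += 2;
	}

	return prefix_len;
}

int main(void)
{
	int level = -1, cont = 0;
	unsigned short skip = parse_prefix("\001" "6" "\001" "c" "hello",
					   &level, &cont);

	printf("level=%d cont=%d payload starts at %u\n", level, cont, skip);
	return 0;
}
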
-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
+__printf(5, 0)
+static u16 printk_sprint(char *text, u16 size, int facility,
+ enum printk_info_flags *flags, const char *fmt,
+ va_list args)
{
- const u32 caller_id = printk_caller_id();
+ u16 text_len;
- /*
- * If an earlier line was buffered, and we're a continuation
- * write from the same context, try to add it to the buffer.
- */
- if (cont.len) {
- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
- /* Otherwise, make sure it's flushed */
- cont_flush();
+ text_len = vscnprintf(text, size, fmt, args);
+
+ /* Mark and strip a trailing newline. */
+ if (text_len && text[text_len - 1] == '\n') {
+ text_len--;
+ *flags |= LOG_NEWLINE;
}
- /* Skip empty continuation lines that couldn't be added - they just flush */
- if (!text_len && (lflags & LOG_CONT))
- return 0;
+ /* Strip log level and control flags. */
+ if (facility == 0) {
+ u16 prefix_len;
- /* If it doesn't end in a newline, try to buffer the current line */
- if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
+ prefix_len = printk_parse_prefix(text, NULL, NULL);
+ if (prefix_len) {
+ text_len -= prefix_len;
+ memmove(text, text + prefix_len, text_len);
+ }
}
- /* Store it in the record log */
- return log_store(caller_id, facility, level, lflags, 0,
- dict, dictlen, text, text_len);
+ trace_console(text, text_len);
+
+ return text_len;
}
-/* Must be called under logbuf_lock. */
+__printf(4, 0)
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
- static char textbuf[LOG_LINE_MAX];
- char *text = textbuf;
- size_t text_len;
- enum log_flags lflags = 0;
+ struct prb_reserved_entry e;
+ enum printk_info_flags flags = 0;
+ struct printk_record r;
+ unsigned long irqflags;
+ u16 trunc_msg_len = 0;
+ char prefix_buf[8];
+ u8 *recursion_ptr;
+ u16 reserve_size;
+ va_list args2;
+ u32 caller_id;
+ u16 text_len;
+ int ret = 0;
+ u64 ts_nsec;
+
+ if (!printk_enter_irqsave(recursion_ptr, irqflags))
+ return 0;
/*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
+ * Since the duration of printk() can vary depending on the message
+ * and state of the ringbuffer, grab the timestamp now so that it is
+ * close to the call of printk(). This provides a more deterministic
+ * timestamp with respect to the caller.
*/
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ ts_nsec = local_clock();
- /* mark and strip a trailing newline */
- if (text_len && text[text_len-1] == '\n') {
- text_len--;
- lflags |= LOG_NEWLINE;
- }
+ caller_id = printk_caller_id();
- /* strip kernel syslog prefix and extract log level or control flags */
- if (facility == 0) {
- int kern_level;
+ /*
+ * The sprintf needs to come first since the syslog prefix might be
+ * passed in as a parameter. An extra byte must be reserved so that
+ * later the vscnprintf() into the reserved buffer has room for the
+ * terminating '\0', which is not counted by vsnprintf().
+ */
+ va_copy(args2, args);
+ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
+ va_end(args2);
- while ((kern_level = printk_get_level(text)) != 0) {
- switch (kern_level) {
- case '0' ... '7':
- if (level == LOGLEVEL_DEFAULT)
- level = kern_level - '0';
- break;
- case 'c': /* KERN_CONT */
- lflags |= LOG_CONT;
+ if (reserve_size > PRINTKRB_RECORD_MAX)
+ reserve_size = PRINTKRB_RECORD_MAX;
+
+ /* Extract log level or control flags. */
+ if (facility == 0)
+ printk_parse_prefix(&prefix_buf[0], &level, &flags);
+
+ if (level == LOGLEVEL_DEFAULT)
+ level = default_message_loglevel;
+
+ if (dev_info)
+ flags |= LOG_NEWLINE;
+
+ if (is_printk_force_console())
+ flags |= LOG_FORCE_CON;
+
+ if (flags & LOG_CONT) {
+ prb_rec_init_wr(&r, reserve_size);
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
+ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
+ facility, &flags, fmt, args);
+ r.info->text_len += text_len;
+
+ if (flags & LOG_FORCE_CON)
+ r.info->flags |= LOG_FORCE_CON;
+
+ if (flags & LOG_NEWLINE) {
+ r.info->flags |= LOG_NEWLINE;
+ prb_final_commit(&e);
+ } else {
+ prb_commit(&e);
}
- text_len -= 2;
- text += 2;
+ ret = text_len;
+ goto out;
}
}
- if (level == LOGLEVEL_DEFAULT)
- level = default_message_loglevel;
+ /*
+ * Explicitly initialize the record before every prb_reserve() call.
+ * prb_reserve_in_last() and prb_reserve() purposely invalidate the
+ * structure when they fail.
+ */
+ prb_rec_init_wr(&r, reserve_size);
+ if (!prb_reserve(&e, prb, &r)) {
+ /* truncate the message if it is too long for an empty buffer */
+ truncate_msg(&reserve_size, &trunc_msg_len);
+
+ prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
+ if (!prb_reserve(&e, prb, &r))
+ goto out;
+ }
+
+ /* fill message */
+ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
+ if (trunc_msg_len)
+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+ r.info->text_len = text_len + trunc_msg_len;
+ r.info->facility = facility;
+ r.info->level = level & 7;
+ r.info->flags = flags & 0x1f;
+ r.info->ts_nsec = ts_nsec;
+ r.info->caller_id = caller_id;
+ if (dev_info)
+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
+
+ /* A message without a trailing newline can be continued. */
+ if (!(flags & LOG_NEWLINE))
+ prb_commit(&e);
+ else
+ prb_final_commit(&e);
+
+ ret = text_len + trunc_msg_len;
+out:
+ printk_exit_irqrestore(recursion_ptr, irqflags);
+ return ret;
+}
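
The reserve_size handling above follows a common two-pass pattern: measure with vsnprintf() into a small scratch buffer, then format for real into the space that was actually reserved. A standalone userspace sketch of that pattern (malloc() stands in for prb_reserve()):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *format_alloc(const char *fmt, ...)
{
	char scratch[8];	/* deliberately tiny, like prefix_buf[] */
	va_list args, args2;
	int needed;
	char *buf;

	va_start(args, fmt);

	/* Pass 1: measure. vsnprintf() returns the full length even when the
	 * scratch buffer is too small; +1 for the terminating '\0'. */
	va_copy(args2, args);
	needed = vsnprintf(scratch, sizeof(scratch), fmt, args2) + 1;
	va_end(args2);

	/* Pass 2: "reserve" the right amount and format for real. */
	buf = malloc(needed);
	if (buf)
		vsnprintf(buf, needed, fmt, args);

	va_end(args);
	return buf;
}

int main(void)
{
	char *msg = format_alloc("pid %d did a thing %s\n", 1234, "(twice)");

	if (msg) {
		fputs(msg, stdout);
		free(msg);
	}
	return 0;
}
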
+
+/*
+ * This acts as a one-way switch to allow legacy consoles to print from
+ * the printk() caller context on a panic CPU. It also attempts to flush
+ * the legacy consoles in this context.
+ */
+void printk_legacy_allow_panic_sync(void)
+{
+ struct console_flush_type ft;
- if (dict)
- lflags |= LOG_NEWLINE;
+ legacy_allow_panic_sync = true;
- return log_output(facility, level, lflags,
- dict, dictlen, text, text_len);
+ printk_get_console_flush_type(&ft);
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
}
asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
+ struct console_flush_type ft;
int printed_len;
- bool in_sched = false, pending_output;
- unsigned long flags;
- u64 curr_log_seq;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
return 0;
+ /*
+ * The messages on the panic CPU are the most important. If
+ * non-panic CPUs are generating any messages, they will be
+ * silently dropped.
+ */
+ if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace)
+ return 0;
+
+ printk_get_console_flush_type(&ft);
+
+ /* If called from the scheduler, we can not call up(). */
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
- in_sched = true;
+ ft.legacy_offload |= ft.legacy_direct;
+ ft.legacy_direct = false;
}
- boot_delay_msec(level);
- printk_delay();
+ printk_delay(level);
- /* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
- curr_log_seq = log_next_seq;
- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
- pending_output = (curr_log_seq != log_next_seq);
- logbuf_unlock_irqrestore(flags);
+ printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- /* If called from the scheduler, we can not call up(). */
- if (!in_sched && pending_output) {
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+
+ if (ft.legacy_direct) {
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
+ * The caller may be holding system-critical or
+ * timing-sensitive locks. Disable preemption during
+ * printing of all remaining records to all consoles so that
+ * this context can return as soon as possible. Hopefully
+ * another printk() caller will take over the printing.
*/
preempt_disable();
/*
* Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
+ * semaphore. The release will print out buffers. With the
+ * spinning variant, this context tries to take over the
+ * printing from another printing context.
*/
if (console_trylock_spinning())
console_unlock();
preempt_enable();
}
- if (pending_output)
+ if (ft.legacy_offload)
+ defer_console_output();
+ else
wake_up_klogd();
+
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- return vprintk_func(fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
int vprintk_default(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
-/**
- * printk - print a kernel message
- * @fmt: format string
- *
- * This is printk(). It can be called from any context. We want it to work.
- *
- * We try to grab the console_lock. If we succeed, it's easy - we log the
- * output and call the console drivers. If we fail to get the semaphore, we
- * place the output into the log buffer and return. The current holder of
- * the console_sem will notice the new output in console_unlock(); and will
- * send it to the consoles before releasing the lock.
- *
- * One effect of this deferred printing is that code which calls printk() and
- * then changes console_loglevel may break. This is because console_loglevel
- * is inspected when the actual printing occurs.
- *
- * See also:
- * printf(3)
- *
- * See the vsnprintf() documentation for format string extensions over C99.
- */
-asmlinkage __visible int printk(const char *fmt, ...)
+asmlinkage __visible int _printk(const char *fmt, ...)
{
va_list args;
int r;
va_start(args, fmt);
- r = vprintk_func(fmt, args);
+ r = vprintk(fmt, args);
va_end(args);
return r;
}
-EXPORT_SYMBOL(printk);
+EXPORT_SYMBOL(_printk);
+
+static bool pr_flush(int timeout_ms, bool reset_on_progress);
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
#define printk_time false
+#define prb_read_valid(rb, seq, r) false
+#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
+
static u64 syslog_seq;
-static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
-static u64 exclusive_console_stop_seq;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static char *log_text(const struct printk_log *msg) { return NULL; }
-static char *log_dict(const struct printk_log *msg) { return NULL; }
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg,
- u64 seq) { return 0; }
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
+
+static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2128,24 +2499,51 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
}
#endif
-static int __add_preferred_console(char *name, int idx, char *options,
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+ if (!user_specified)
+ return;
+
+ /*
+ * @c console was defined by the user on the command line.
+ * Do not clear the flag when the same console is added again by
+ * SPCR or the device tree.
+ */
+ c->user_specified = true;
+ /* At least one console defined by the user on the command line. */
+ console_set_on_cmdline = 1;
+}
+
+static int __add_preferred_console(const char *name, const short idx,
+ const char *devname, char *options,
char *brl_options, bool user_specified)
{
struct console_cmdline *c;
int i;
+ if (!name && !devname)
+ return -EINVAL;
+
+ /*
+ * We use a signed short index in struct console so that device drivers
+ * can indicate a not-yet-assigned index or port. However, a negative index
+ * value is not valid when the console name and index are defined on
+ * the command line.
+ */
+ if (name && idx < 0)
+ return -EINVAL;
+
/*
* See if this tty is not yet registered, and
* if we have a slot free.
*/
for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
+ i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
i++, c++) {
- if (strcmp(c->name, name) == 0 && c->index == idx) {
+ if ((name && strcmp(c->name, name) == 0 && c->index == idx) ||
+ (devname && strcmp(c->devname, devname) == 0)) {
if (!brl_options)
preferred_console = i;
- if (user_specified)
- c->user_specified = true;
+ set_user_specified(c, user_specified);
return 0;
}
}
@@ -2153,9 +2551,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
return -E2BIG;
if (!brl_options)
preferred_console = i;
- strlcpy(c->name, name, sizeof(c->name));
+ if (name)
+ strscpy(c->name, name);
+ if (devname)
+ strscpy(c->devname, devname);
c->options = options;
- c->user_specified = user_specified;
+ set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
c->index = idx;
@@ -2178,43 +2579,66 @@ __setup("console_msg_format=", console_msg_format_setup);
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
- char *s, *options, *brl_options = NULL;
+ static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4);
+ char buf[sizeof(console_cmdline[0].devname)];
+ char *brl_options = NULL;
+ char *ttyname = NULL;
+ char *devname = NULL;
+ char *options;
+ char *s;
int idx;
- if (str[0] == 0)
+ /*
+ * console="" or console=null have been suggested as a way to
+ * disable console output. Use ttynull that has been created
+ * for exactly this purpose.
+ */
+ if (str[0] == 0 || strcmp(str, "null") == 0) {
+ __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true);
return 1;
+ }
if (_braille_console_setup(&str, &brl_options))
return 1;
+ /* For a DEVNAME:0.0 style console the character device is unknown early */
+ if (strchr(str, ':'))
+ devname = buf;
+ else
+ ttyname = buf;
+
/*
* Decode str into name, index, options.
*/
- if (str[0] >= '0' && str[0] <= '9') {
- strcpy(buf, "ttyS");
- strncpy(buf + 4, str, sizeof(buf) - 5);
- } else {
- strncpy(buf, str, sizeof(buf) - 1);
- }
- buf[sizeof(buf) - 1] = 0;
+ if (ttyname && isdigit(str[0]))
+ scnprintf(buf, sizeof(buf), "ttyS%s", str);
+ else
+ strscpy(buf, str);
+
options = strchr(str, ',');
if (options)
*(options++) = 0;
+
#ifdef __sparc__
if (!strcmp(str, "ttya"))
- strcpy(buf, "ttyS0");
+ strscpy(buf, "ttyS0");
if (!strcmp(str, "ttyb"))
- strcpy(buf, "ttyS1");
+ strscpy(buf, "ttyS1");
#endif
+
for (s = buf; *s; s++)
- if (isdigit(*s) || *s == ',')
+ if ((ttyname && isdigit(*s)) || *s == ',')
break;
- idx = simple_strtoul(s, NULL, 10);
+
+ /* @idx will get defined when devname matches. */
+ if (devname)
+ idx = -1;
+ else
+ idx = simple_strtoul(s, NULL, 10);
+
*s = 0;
- __add_preferred_console(buf, idx, options, brl_options, true);
- console_set_on_cmdline = 1;
+ __add_preferred_console(ttyname, idx, devname, options, brl_options, true);
return 1;
}
__setup("console=", console_setup);
@@ -2232,10 +2656,55 @@ __setup("console=", console_setup);
* commonly to provide a default console (ie from PROM variables) when
* the user has not supplied one.
*/
-int add_preferred_console(char *name, int idx, char *options)
+int add_preferred_console(const char *name, const short idx, char *options)
+{
+ return __add_preferred_console(name, idx, NULL, options, NULL, false);
+}
+
+/**
+ * match_devname_and_update_preferred_console - Update a preferred console
+ * when matching devname is found.
+ * @devname: DEVNAME:0.0 style device name
+ * @name: Name of the corresponding console driver, e.g. "ttyS"
+ * @idx: Console index, e.g. port number.
+ *
+ * The function checks whether a device with the given @devname is
+ * preferred via the console=DEVNAME:0.0 command line option.
+ * It fills the missing console driver name and console index
+ * so that a later register_console() call could find (match)
+ * and enable this device.
+ *
+ * It might be used when a driver subsystem initializes particular
+ * devices with already known DEVNAME:0.0 style names and can
+ * predict which console driver name and index this device will
+ * later be associated with.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int match_devname_and_update_preferred_console(const char *devname,
+ const char *name,
+ const short idx)
{
- return __add_preferred_console(name, idx, options, NULL, false);
+ struct console_cmdline *c = console_cmdline;
+ int i;
+
+ if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0)
+ return -EINVAL;
+
+ for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
+ i++, c++) {
+ if (!strcmp(devname, c->devname)) {
+ pr_info("associate the preferred console \"%s\" with \"%s%d\"\n",
+ devname, name, idx);
+ strscpy(c->name, name);
+ c->index = idx;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
}
+EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console);
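
The matching itself is a simple table scan keyed on devname. A standalone sketch of that scan (field sizes and the "serial8250:0.0" name are illustrative only):

#include <stdio.h>
#include <string.h>

struct cmdline_entry {
	char name[16];
	char devname[32];
	short index;
};

static int match_and_update(struct cmdline_entry *tbl, int nr,
			    const char *devname, const char *name, short idx)
{
	int i;

	for (i = 0; i < nr && (tbl[i].name[0] || tbl[i].devname[0]); i++) {
		if (!strcmp(devname, tbl[i].devname)) {
			/* Fill in the driver name and index for a later match. */
			snprintf(tbl[i].name, sizeof(tbl[i].name), "%s", name);
			tbl[i].index = idx;
			return 0;
		}
	}
	return -1;	/* no matching console= entry */
}

int main(void)
{
	struct cmdline_entry tbl[4] = {
		{ .devname = "serial8250:0.0", .index = -1 },
	};

	if (!match_and_update(tbl, 4, "serial8250:0.0", "ttyS", 0))
		printf("associated \"%s\" with \"%s%d\"\n",
		       tbl[0].devname, tbl[0].name, tbl[0].index);
	return 0;
}
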
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
@@ -2251,6 +2720,18 @@ module_param_named(console_suspend, console_suspend_enabled,
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
" and hibernate operations");
+static bool printk_console_no_auto_verbose;
+
+void console_verbose(void)
+{
+ if (console_loglevel && !printk_console_no_auto_verbose)
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
+}
+EXPORT_SYMBOL_GPL(console_verbose);
+
+module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
+MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
+
/**
* suspend_console - suspend the console subsystem
*
@@ -2258,21 +2739,54 @@ MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
*/
void suspend_console(void)
{
+ struct console *con;
+
if (!console_suspend_enabled)
return;
pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
- console_lock();
- console_suspended = 1;
- up_console_sem();
+ pr_flush(1000, true);
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see that they are suspended so that it
+ * is guaranteed that all printing has stopped when this function
+ * completes.
+ */
+ synchronize_srcu(&console_srcu);
}
void resume_console(void)
{
+ struct console_flush_type ft;
+ struct console *con;
+
if (!console_suspend_enabled)
return;
- down_console_sem();
- console_suspended = 0;
- console_unlock();
+
+ console_list_lock();
+ for_each_console(con)
+ console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All printing
+ * contexts must be able to see they are no longer suspended so
+ * that they are guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_offload)
+ defer_console_output();
+
+ pr_flush(1000, true);
}
/**
@@ -2286,19 +2800,25 @@ void resume_console(void)
*/
static int console_cpu_notify(unsigned int cpu)
{
+ struct console_flush_type ft;
+
if (!cpuhp_tasks_frozen) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.legacy_direct) {
+ if (console_trylock())
+ console_unlock();
+ }
}
return 0;
}
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2306,30 +2826,31 @@ void console_lock(void)
{
might_sleep();
+ /* On panic, the console_lock must be left to the panic cpu. */
+ while (other_cpu_in_panic())
+ msleep(1000);
+
down_console_sem();
- if (console_suspended)
- return;
console_locked = 1;
console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
int console_trylock(void)
{
- if (down_trylock_console_sem())
+ /* On panic, the console_lock must be left to the panic cpu. */
+ if (other_cpu_in_panic())
return 0;
- if (console_suspended) {
- up_console_sem();
+ if (down_trylock_console_sem())
return 0;
- }
console_locked = 1;
console_may_schedule = 0;
return 1;
@@ -2342,187 +2863,452 @@ int is_console_locked(void)
}
EXPORT_SYMBOL(is_console_locked);
+static void __console_unlock(void)
+{
+ console_locked = 0;
+ up_console_sem();
+}
+
+#ifdef CONFIG_PRINTK
+
/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
+ * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting
+ * the existing message over and inserting the scratchbuf message.
+ *
+ * @pmsg is the original printk message.
+ * @fmt is the printf format of the message which will prepend the existing one.
+ *
+ * If there is not enough space in @pmsg->pbufs->outbuf, the existing
+ * message text will be sufficiently truncated.
+ *
+ * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
*/
-static int have_callable_console(void)
+__printf(2, 3)
+static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...)
{
- struct console *con;
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ va_list args;
+ size_t len;
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
+ va_start(args, fmt);
+ len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args);
+ va_end(args);
- return 0;
+ /*
+ * Make sure outbuf is sufficiently large before prepending.
+ * Keep at least the prefix when the message must be truncated.
+ * It is a rather theoretical problem when someone tries to
+ * use a minimalist buffer.
+ */
+ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
+ return;
+
+ if (pmsg->outbuf_len + len >= outbuf_sz) {
+ /* Truncate the message, but keep it terminated. */
+ pmsg->outbuf_len = outbuf_sz - (len + 1);
+ outbuf[pmsg->outbuf_len] = 0;
+ }
+
+ memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1);
+ memcpy(outbuf, scratchbuf, len);
+ pmsg->outbuf_len += len;
}
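
A standalone sketch of the same prepend-in-place move (simplified: it assumes the prefix itself always fits, which the WARN_ON_ONCE() above guards against in the real code):

#include <stdio.h>
#include <string.h>

static size_t prepend(char *outbuf, size_t outbuf_sz, size_t out_len,
		      const char *prefix)
{
	size_t len = strlen(prefix);

	if (out_len + len >= outbuf_sz) {
		/* Truncate the existing message, but keep it terminated. */
		out_len = outbuf_sz - (len + 1);
		outbuf[out_len] = '\0';
	}

	/* Shift the message right (including its '\0') and copy the prefix. */
	memmove(outbuf + len, outbuf, out_len + 1);
	memcpy(outbuf, prefix, len);
	return out_len + len;
}

int main(void)
{
	char outbuf[64] = "real message\n";

	prepend(outbuf, sizeof(outbuf), strlen(outbuf),
		"** 3 printk messages dropped **\n");
	fputs(outbuf, stdout);
	return 0;
}
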
/*
- * Can we actually use the console at this time on this cpu?
+ * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message".
+ * @pmsg->outbuf_len is updated appropriately.
*
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * @pmsg is the printk message to prepend.
+ *
+ * @dropped is the dropped count to report in the dropped message.
*/
-static inline int can_use_console(void)
+void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
+ console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped);
}
-/**
- * console_unlock - unlock the console system
+/*
+ * Prepend the message in @pmsg->pbufs->outbuf with a "replay message".
+ * @pmsg->outbuf_len is updated appropriately.
+ *
+ * @pmsg is the printk message to prepend.
+ */
+void console_prepend_replay(struct printk_message *pmsg)
+{
+ console_prepend_message(pmsg, "** replaying previous printk message **\n");
+}
+
+/*
+ * Read and format the specified record (or a later record if the specified
+ * record is not available).
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
+ * struct printk_buffers.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
+ * @seq is the record to read and format. If it is not available, the next
+ * valid record is read.
*
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
+ * @is_extended specifies if the message should be formatted for extended
+ * console output.
*
- * console_unlock(); may be called from any context.
+ * @may_suppress specifies if records may be skipped based on loglevel.
+ *
+ * Returns false if no record is available. Otherwise true and all fields
+ * of @pmsg are valid. (See the documentation of struct printk_message
+ * for information about the @pmsg fields.)
*/
-void console_unlock(void)
+bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
+ bool is_extended, bool may_suppress)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
- unsigned long flags;
- bool do_cond_resched, retry;
-
- if (console_suspended) {
- up_console_sem();
- return;
- }
+ struct printk_buffers *pbufs = pmsg->pbufs;
+ const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf);
+ const size_t outbuf_sz = sizeof(pbufs->outbuf);
+ char *scratchbuf = &pbufs->scratchbuf[0];
+ char *outbuf = &pbufs->outbuf[0];
+ struct printk_info info;
+ struct printk_record r;
+ size_t len = 0;
+ bool force_con;
/*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
+ * Formatting extended messages requires a separate buffer, so use the
+ * scratch buffer to read in the ringbuffer text.
*
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the the "again" goto label.
+ * Formatting normal messages is done in-place, so read the ringbuffer
+ * text directly into the output buffer.
*/
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
+ if (is_extended)
+ prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz);
+ else
+ prb_rec_init_rd(&r, &info, outbuf, outbuf_sz);
+
+ if (!prb_read_valid(prb, seq, &r))
+ return false;
+
+ pmsg->seq = r.info->seq;
+ pmsg->dropped = r.info->seq - seq;
+ force_con = r.info->flags & LOG_FORCE_CON;
/*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
+ * Skip records that are not forced to be printed on consoles and that
+ * have a level above the console loglevel.
*/
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
+ if (!force_con && may_suppress && suppress_message_printing(r.info->level))
+ goto out;
+
+ if (is_extended) {
+ len = info_print_ext_header(outbuf, outbuf_sz, r.info);
+ len += msg_print_ext_body(outbuf + len, outbuf_sz - len,
+ &r.text_buf[0], r.info->text_len, &r.info->dev_info);
+ } else {
+ len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
}
+out:
+ pmsg->outbuf_len = len;
+ return true;
+}
- for (;;) {
- struct printk_log *msg;
- size_t ext_len = 0;
- size_t len;
+/*
+ * Legacy console printing from printk() caller context does not respect
+ * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a
+ * false positive. For PREEMPT_RT the false positive condition does not
+ * occur.
+ *
+ * This map is used to temporarily establish LD_WAIT_SLEEP context for the
+ * console write() callback during legacy printing. This avoids false-positive
+ * lockdep complaints while allowing lockdep to continue to function for
+ * real issues.
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline void printk_legacy_allow_spinlock_enter(void) { }
+static inline void printk_legacy_allow_spinlock_exit(void) { }
+#else
+static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP);
- printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
- if (console_seq < log_first_seq) {
- len = snprintf(text, sizeof(text),
- "** %llu printk messages dropped **\n",
- log_first_seq - console_seq);
+static inline void printk_legacy_allow_spinlock_enter(void)
+{
+ lock_map_acquire_try(&printk_legacy_map);
+}
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- } else {
- len = 0;
- }
-skip:
- if (console_seq == log_next_seq)
- break;
+static inline void printk_legacy_allow_spinlock_exit(void)
+{
+ lock_map_release(&printk_legacy_map);
+}
+#endif /* CONFIG_PREEMPT_RT */
- msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- console_idx = log_next(console_idx);
- console_seq++;
- goto skip;
- }
+/*
+ * Used as the printk buffers for non-panic, serialized console printing.
+ * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
+ * Its use requires the console_lock to be held.
+ */
+struct printk_buffers printk_shared_pbufs;
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
+/*
+ * Print one record for the given console. The record printed is whatever
+ * record is the next available record for the given console.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
+ *
+ * @cookie is the cookie from the SRCU read lock.
+ *
+ * Returns false if the given console has no next record to print, otherwise
+ * true.
+ *
+ * Requires the console_lock and the SRCU read lock.
+ */
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
+ char *outbuf = &printk_shared_pbufs.outbuf[0];
+ struct printk_message pmsg = {
+ .pbufs = &printk_shared_pbufs,
+ };
+ unsigned long flags;
- len += msg_print_text(msg,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time, text + len, sizeof(text) - len);
- if (nr_ext_console_drivers) {
- ext_len = msg_print_ext_header(ext_text,
- sizeof(ext_text),
- msg, console_seq);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
- }
- console_idx = log_next(console_idx);
- console_seq++;
- raw_spin_unlock(&logbuf_lock);
+ *handover = false;
+
+ if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
+ return false;
+
+ con->dropped += pmsg.dropped;
+
+ /* Skip messages of formatted length 0. */
+ if (pmsg.outbuf_len == 0) {
+ con->seq = pmsg.seq + 1;
+ goto skip;
+ }
+ if (con->dropped && !is_extended) {
+ console_prepend_dropped(&pmsg, con->dropped);
+ con->dropped = 0;
+ }
+
+ /* Write everything out to the hardware. */
+
+ if (force_legacy_kthread() && !panic_in_progress()) {
+ /*
+ * With forced threading this function is in a task context
+ * (either legacy kthread or get_init_console_seq()). There
+ * is no need for concern about printk reentrance, handovers,
+ * or lockdep complaints.
+ */
+
+ con->write(con, outbuf, pmsg.outbuf_len);
+ con->seq = pmsg.seq + 1;
+ } else {
/*
* While actively printing out messages, if another printk()
* were to occur on another CPU, it may wait for this one to
* finish. This task can not be preempted if there is a
* waiter waiting to take over.
+ *
+ * Interrupts are disabled because the hand over to a waiter
+ * must not be interrupted until the hand over is completed
+ * (@console_waiter is cleared).
*/
+ printk_safe_enter_irqsave(flags);
console_lock_spinning_enable();
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
+ /* Do not trace print latency. */
+ stop_critical_timings();
+
+ printk_legacy_allow_spinlock_enter();
+ con->write(con, outbuf, pmsg.outbuf_len);
+ printk_legacy_allow_spinlock_exit();
+
start_critical_timings();
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
- }
+ con->seq = pmsg.seq + 1;
+ *handover = console_lock_spinning_disable_and_check(cookie);
printk_safe_exit_irqrestore(flags);
-
- if (do_cond_resched)
- cond_resched();
}
+skip:
+ return true;
+}
- console_locked = 0;
+#else
+
+static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
+{
+ *handover = false;
+ return false;
+}
- raw_spin_unlock(&logbuf_lock);
+static inline void printk_kthreads_check_locked(void) { }
- up_console_sem();
+#endif /* CONFIG_PRINTK */
+
+/*
+ * Print out all remaining records to all consoles.
+ *
+ * @do_cond_resched is set by the caller. It can be true only in schedulable
+ * context.
+ *
+ * @next_seq is set to the sequence number after the last available record.
+ * The value is valid only when this function returns true. It means that all
+ * usable consoles are completely flushed.
+ *
+ * @handover will be set to true if a printk waiter has taken over the
+ * console_lock, in which case the caller is no longer holding the
+ * console_lock. Otherwise it is set to false.
+ *
+ * Returns true when there was at least one usable console and all messages
+ * were flushed to all usable consoles. A returned false informs the caller
+ * that everything was not flushed (either there were no usable consoles or
+ * another context has taken over printing or it is a panic situation and this
+ * is not the panic CPU). Regardless the reason, the caller should assume it
+ * is not useful to immediately try again.
+ *
+ * Requires the console_lock.
+ */
+static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
+{
+ struct console_flush_type ft;
+ bool any_usable = false;
+ struct console *con;
+ bool any_progress;
+ int cookie;
+
+ *next_seq = 0;
+ *handover = false;
+
+ do {
+ any_progress = false;
+
+ printk_get_console_flush_type(&ft);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
+ bool progress;
+
+ /*
+ * console_flush_all() is only responsible for nbcon
+ * consoles when the nbcon consoles cannot print via
+ * their atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
+
+ if (!console_is_usable(con, flags, !do_cond_resched))
+ continue;
+ any_usable = true;
+
+ if (flags & CON_NBCON) {
+ progress = nbcon_legacy_emit_next_record(con, handover, cookie,
+ !do_cond_resched);
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ progress = console_emit_next_record(con, handover, cookie);
+ printk_seq = con->seq;
+ }
+
+ /*
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
+ */
+ if (*handover)
+ return false;
+
+ /* Track the next of the highest seq flushed. */
+ if (printk_seq > *next_seq)
+ *next_seq = printk_seq;
+
+ if (!progress)
+ continue;
+ any_progress = true;
+
+ /* Allow panic_cpu to take over the consoles safely. */
+ if (other_cpu_in_panic())
+ goto abandon;
+
+ if (do_cond_resched)
+ cond_resched();
+ }
+ console_srcu_read_unlock(cookie);
+ } while (any_progress);
+
+ return any_usable;
+
+abandon:
+ console_srcu_read_unlock(cookie);
+ return false;
+}
+
+static void __console_flush_and_unlock(void)
+{
+ bool do_cond_resched;
+ bool handover;
+ bool flushed;
+ u64 next_seq;
/*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
+	 * Console drivers are called with interrupts disabled, so
+	 * @console_may_schedule should be cleared before printing starts.
+	 * However, a lot of lines may need to be dumped (for example, when
+	 * called from the console registration path), so cond_resched()
+	 * should be invoked between lines when allowable. Not doing so can
+	 * cause a very long scheduling stall on a slow console, leading to
+	 * RCU stall and softlockup warnings, which in turn generate even
+	 * more messages and practically incapacitate the system. Therefore,
+	 * create a local to use for the printing loop.
*/
- raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
- raw_spin_unlock(&logbuf_lock);
- printk_safe_exit_irqrestore(flags);
+ do_cond_resched = console_may_schedule;
- if (retry && console_trylock())
- goto again;
+ do {
+ console_may_schedule = 0;
+
+ flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
+ if (!handover)
+ __console_unlock();
+
+ /*
+ * Abort if there was a failure to flush all messages to all
+ * usable consoles. Either it is not possible to flush (in
+ * which case it would be an infinite loop of retrying) or
+ * another context has taken over printing.
+ */
+ if (!flushed)
+ break;
+
+ /*
+ * Some context may have added new records after
+ * console_flush_all() but before unlocking the console.
+ * Re-check if there is a new record to flush. If the trylock
+ * fails, another context is already handling the printing.
+ */
+ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
+}
+
+/**
+ * console_unlock - unblock the legacy console subsystem from printing
+ *
+ * Releases the console_lock which the caller holds to block printing of
+ * the legacy console subsystem.
+ *
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock() emits the output on
+ * legacy consoles prior to releasing the lock.
+ *
+ * console_unlock(); may be called from any context.
+ */
+void console_unlock(void)
+{
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (ft.legacy_direct)
+ __console_flush_and_unlock();
+ else
+ __console_unlock();
}
EXPORT_SYMBOL(console_unlock);
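
For readers who have not used the legacy console locking API, here is a minimal, illustrative sketch (not part of this patch) of the pattern the kerneldoc above describes: take console_lock() to block legacy console output, do the work, and let console_unlock() flush whatever printk() buffered in the meantime. The helper name is invented for illustration.

#include <linux/console.h>
#include <linux/printk.h>

/* Hypothetical helper; not part of the kernel. */
static void example_reconfigure_console_state(void)
{
	console_lock();				/* block legacy console printing */

	/* ... modify state that console drivers depend on ... */
	pr_info("example: state updated\n");	/* stored, not yet printed */

	console_unlock();			/* prints the buffered record(s), if allowed */
}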
@@ -2544,13 +3330,45 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
+ bool found_unblank = false;
struct console *c;
+ int cookie;
+
+ /*
+ * First check if there are any consoles implementing the unblank()
+ * callback. If not, there is no reason to continue and take the
+ * console lock, which in particular can be dangerous if
+ * @oops_in_progress is set.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ found_unblank = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+ if (!found_unblank)
+ return;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
+ /* Semaphores are not NMI-safe. */
+ if (in_nmi())
+ return;
+
+ /*
+ * Attempting to trylock the console lock can deadlock
+ * if another CPU was stopped while modifying the
+ * semaphore. "Hope and pray" that this is not the
+ * current situation.
+ */
if (down_trylock_console_sem() != 0)
return;
} else
@@ -2558,10 +3376,52 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
+
console_unlock();
+
+ if (!oops_in_progress)
+ pr_flush(1000, true);
+}
+
+/*
+ * Rewind all consoles to the oldest available record.
+ *
+ * IMPORTANT: The function is safe only when called under
+ * console_lock(). It is not enforced because
+ * it is used as a best effort in panic().
+ */
+static void __console_rewind_all(void)
+{
+ struct console *c;
+ short flags;
+ int cookie;
+ u64 seq;
+
+ seq = prb_first_valid_seq(prb);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ flags = console_srcu_read_flags(c);
+
+ if (flags & CON_NBCON) {
+ nbcon_seq_force(c, seq);
+ } else {
+ /*
+ * This assignment is safe only when called under
+ * console_lock(). On panic, legacy consoles are
+ * only best effort.
+ */
+ c->seq = seq;
+ }
+ }
+ console_srcu_read_unlock(cookie);
}
/**
@@ -2572,25 +3432,37 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
+ struct console_flush_type ft;
+ bool handover;
+ u64 next_seq;
+
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * Ignore the console lock and flush out the messages. Attempting a
+ * trylock would not be useful because:
+ *
+ * - if it is contended, it must be ignored anyway
+ * - console_lock() and console_trylock() block and fail
+ * respectively in panic for non-panic CPUs
+ * - semaphores are not NMI-safe
+ */
+
+ /*
+ * If another context is holding the console lock,
+ * @console_may_schedule might be set. Clear it so that
+ * this context does not call cond_resched() while flushing.
*/
- console_trylock();
console_may_schedule = 0;
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
+ if (mode == CONSOLE_REPLAY_ALL)
+ __console_rewind_all();
- logbuf_lock_irqsave(flags);
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- logbuf_unlock_irqrestore(flags);
- }
- console_unlock();
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+
+ /* Flush legacy consoles once allowed, even when dangerous. */
+ if (legacy_allow_panic_sync)
+ console_flush_all(false, &next_seq, &handover);
}
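
As a rough illustration of how the two flush modes differ, the sketch below (a hypothetical wrapper, loosely modeled on how the panic() path is expected to use this hook) either flushes only the pending records or first rewinds all consoles via CONSOLE_REPLAY_ALL.

#include <linux/console.h>

/* Hypothetical wrapper; panic() itself decides the mode from panic_print. */
static void example_panic_flush(bool replay_all)
{
	if (replay_all)
		console_flush_on_panic(CONSOLE_REPLAY_ALL);	/* rewind, then flush */
	else
		console_flush_on_panic(CONSOLE_FLUSH_PENDING);	/* flush remaining records */
}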
/*
@@ -2600,15 +3472,25 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
+ int cookie;
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
console_lock();
- for_each_console(c) {
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
+ console_srcu_read_unlock(cookie);
+
console_unlock();
return driver;
}
@@ -2620,20 +3502,253 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ __pr_flush(console, 1000, true);
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts must
+ * be able to see that this console is disabled so that (for example)
+ * the caller can suspend the port without risk of another context
+ * using the port.
+ */
+ synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ struct console_flush_type ft;
+ bool is_nbcon;
+
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ is_nbcon = console->flags & CON_NBCON;
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. The related
+ * printing context must be able to see it is enabled so that
+ * it is guaranteed to wake up and resume printing.
+ */
+ synchronize_srcu(&console_srcu);
+
+ printk_get_console_flush_type(&ft);
+ if (is_nbcon && ft.nbcon_offload)
+ nbcon_kthread_wake(console);
+ else if (ft.legacy_offload)
+ defer_console_output();
+
+ __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
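
A hedged sketch of the intended caller pattern: a serial driver that also registers a console typically stops console output around suspending its port and restarts it on resume (real drivers usually go through the serial core). The struct and function names below are invented for illustration.

#include <linux/console.h>

struct example_port {			/* hypothetical driver state */
	struct console *cons;		/* console bound to this port, if any */
};

static int example_port_suspend(struct example_port *p)
{
	if (p->cons)
		console_stop(p->cons);	/* flush, disable, wait for SRCU readers */
	/* ... power down the UART ... */
	return 0;
}

static int example_port_resume(struct example_port *p)
{
	/* ... power the UART back up ... */
	if (p->cons)
		console_start(p->cons);	/* re-enable and flush the backlog */
	return 0;
}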
+#ifdef CONFIG_PRINTK
+static int unregister_console_locked(struct console *console);
+
+/* True when system boot is far enough to create printer threads. */
+static bool printk_kthreads_ready __ro_after_init;
+
+static struct task_struct *printk_legacy_kthread;
+
+static bool legacy_kthread_should_wakeup(void)
+{
+ struct console_flush_type ft;
+ struct console *con;
+ bool ret = false;
+ int cookie;
+
+ if (kthread_should_stop())
+ return true;
+
+ printk_get_console_flush_type(&ft);
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
+ short flags = console_srcu_read_flags(con);
+ u64 printk_seq;
+
+ /*
+ * The legacy printer thread is only responsible for nbcon
+ * consoles when the nbcon consoles cannot print via their
+ * atomic or threaded flushing.
+ */
+ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
+ continue;
+
+ if (!console_is_usable(con, flags, false))
+ continue;
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(con);
+ } else {
+ /*
+ * It is safe to read @seq because only this
+ * thread context updates @seq.
+ */
+ printk_seq = con->seq;
+ }
+
+ if (prb_read_valid(prb, printk_seq, NULL)) {
+ ret = true;
+ break;
+ }
+ }
+ console_srcu_read_unlock(cookie);
+
+ return ret;
+}
+
+static int legacy_kthread_func(void *unused)
+{
+ for (;;) {
+ wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());
+
+ if (kthread_should_stop())
+ break;
+
+ console_lock();
+ __console_flush_and_unlock();
+ }
+
+ return 0;
+}
+
+static bool legacy_kthread_create(void)
+{
+ struct task_struct *kt;
+
+ lockdep_assert_console_list_lock_held();
+
+ kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy");
+ if (WARN_ON(IS_ERR(kt))) {
+ pr_err("failed to start legacy printing thread\n");
+ return false;
+ }
+
+ printk_legacy_kthread = kt;
+
+ /*
+ * It is important that console printing threads are scheduled
+ * shortly after a printk call and with generous runtime budgets.
+ */
+ sched_set_normal(printk_legacy_kthread, -20);
+
+ return true;
+}
+
+/**
+ * printk_kthreads_shutdown - shutdown all threaded printers
+ *
+ * On system shutdown all threaded printers are stopped. This allows printk
+ * to transition back to atomic printing, thus providing a robust mechanism
+ * for the final shutdown/reboot messages to be output.
+ */
+static void printk_kthreads_shutdown(void)
+{
+ struct console *con;
+
+ console_list_lock();
+ if (printk_kthreads_running) {
+ printk_kthreads_running = false;
+
+ for_each_console(con) {
+ if (con->flags & CON_NBCON)
+ nbcon_kthread_stop(con);
+ }
+
+ /*
+ * The threads may have been stopped while printing a
+ * backlog. Flush any records left over.
+ */
+ nbcon_atomic_flush_pending();
+ }
+ console_list_unlock();
+}
+
+static struct syscore_ops printk_syscore_ops = {
+ .shutdown = printk_kthreads_shutdown,
+};
+
+/*
+ * If appropriate, start nbcon kthreads and set @printk_kthreads_running.
+ * If any kthreads fail to start, those consoles are unregistered.
+ *
+ * Must be called under console_list_lock().
+ */
+static void printk_kthreads_check_locked(void)
+{
+ struct hlist_node *tmp;
+ struct console *con;
+
+ lockdep_assert_console_list_lock_held();
+
+ if (!printk_kthreads_ready)
+ return;
+
+ if (have_legacy_console || have_boot_console) {
+ if (!printk_legacy_kthread &&
+ force_legacy_kthread() &&
+ !legacy_kthread_create()) {
+ /*
+ * All legacy consoles must be unregistered. If there
+ * are any nbcon consoles, they will set up their own
+ * kthread.
+ */
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_NBCON)
+ continue;
+
+ unregister_console_locked(con);
+ }
+ }
+ } else if (printk_legacy_kthread) {
+ kthread_stop(printk_legacy_kthread);
+ printk_legacy_kthread = NULL;
+ }
+
+ /*
+ * Printer threads cannot be started as long as any boot console is
+ * registered because there is no way to synchronize the hardware
+ * registers between boot console code and regular console code.
+ * It can only be known that there will be no new boot consoles when
+ * an nbcon console is registered.
+ */
+ if (have_boot_console || !have_nbcon_console) {
+ /* Clear flag in case all nbcon consoles unregistered. */
+ printk_kthreads_running = false;
+ return;
+ }
+
+ if (printk_kthreads_running)
+ return;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (!(con->flags & CON_NBCON))
+ continue;
+
+ if (!nbcon_kthread_create(con))
+ unregister_console_locked(con);
+ }
+
+ printk_kthreads_running = true;
+}
+
+static int __init printk_set_kthreads_ready(void)
+{
+ register_syscore_ops(&printk_syscore_ops);
+
+ console_list_lock();
+ printk_kthreads_ready = true;
+ printk_kthreads_check_locked();
+ console_list_unlock();
+
+ return 0;
+}
+early_initcall(printk_set_kthreads_ready);
+#endif /* CONFIG_PRINTK */
+
static int __read_mostly keep_bootcon;
static int __init keep_bootcon_setup(char *str)
@@ -2646,6 +3761,21 @@ static int __init keep_bootcon_setup(char *str)
early_param("keep_bootcon", keep_bootcon_setup);
+static int console_call_setup(struct console *newcon, char *options)
+{
+ int err;
+
+ if (!newcon->setup)
+ return 0;
+
+ /* Synchronize with possible boot console. */
+ console_lock();
+ err = newcon->setup(newcon, options);
+ console_unlock();
+
+ return err;
+}
+
/*
* This is called by register_console() to try to match
* the newly registered console with any of the ones selected
@@ -2655,14 +3785,18 @@ early_param("keep_bootcon", keep_bootcon_setup);
* Care need to be taken with consoles that are statically
* enabled such as netconsole
*/
-static int try_enable_new_console(struct console *newcon, bool user_specified)
+static int try_enable_preferred_console(struct console *newcon,
+ bool user_specified)
{
struct console_cmdline *c;
- int i;
+ int i, err;
for (i = 0, c = console_cmdline;
- i < MAX_CMDLINECONSOLES && c->name[0];
+ i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
i++, c++) {
+ /* Console not yet initialized? */
+ if (!c->name[0])
+ continue;
if (c->user_specified != user_specified)
continue;
if (!newcon->match ||
@@ -2680,22 +3814,20 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
if (_braille_register_console(newcon, c))
return 0;
- if (newcon->setup &&
- newcon->setup(newcon, c->options) != 0)
- return -EIO;
+ err = console_call_setup(newcon, c->options);
+ if (err)
+ return err;
}
newcon->flags |= CON_ENABLED;
- if (i == preferred_console) {
+ if (i == preferred_console)
newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
return 0;
}
/*
* Some consoles, such as pstore and netconsole, can be enabled even
* without matching. Accept the pre-enabled consoles only when match()
- * and setup() had a change to be called.
+ * and setup() had a chance to be called.
*/
if (newcon->flags & CON_ENABLED && c->user_specified == user_specified)
return 0;
@@ -2703,6 +3835,100 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
return -ENOENT;
}
+/* Try to enable the console unconditionally */
+static void try_enable_default_console(struct console *newcon)
+{
+ if (newcon->index < 0)
+ newcon->index = 0;
+
+ if (console_call_setup(newcon, NULL) != 0)
+ return;
+
+ newcon->flags |= CON_ENABLED;
+
+ if (newcon->device)
+ newcon->flags |= CON_CONSDEV;
+}
+
+/* Return the starting sequence number for a newly registered console. */
+static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered)
+{
+ struct console *con;
+ bool handover;
+ u64 init_seq;
+
+ if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
+ /* Get a consistent copy of @syslog_seq. */
+ mutex_lock(&syslog_lock);
+ init_seq = syslog_seq;
+ mutex_unlock(&syslog_lock);
+ } else {
+ /* Begin with next message added to ringbuffer. */
+ init_seq = prb_next_seq(prb);
+
+ /*
+ * If any enabled boot consoles are due to be unregistered
+ * shortly, some may not be caught up and may be the same
+ * device as @newcon. Since it is not known which boot console
+ * is the same device, flush all consoles and, if necessary,
+ * start with the message of the enabled boot console that is
+ * the furthest behind.
+ */
+ if (bootcon_registered && !keep_bootcon) {
+ /*
+ * Hold the console_lock to stop console printing and
+ * guarantee safe access to console->seq.
+ */
+ console_lock();
+
+ /*
+ * Flush all consoles and set the console to start at
+ * the next unprinted sequence number.
+ */
+ if (!console_flush_all(true, &init_seq, &handover)) {
+ /*
+ * Flushing failed. Just choose the lowest
+ * sequence of the enabled boot consoles.
+ */
+
+ /*
+ * If there was a handover, this context no
+ * longer holds the console_lock.
+ */
+ if (handover)
+ console_lock();
+
+ init_seq = prb_next_seq(prb);
+ for_each_console(con) {
+ u64 seq;
+
+ if (!(con->flags & CON_BOOT) ||
+ !(con->flags & CON_ENABLED)) {
+ continue;
+ }
+
+ if (con->flags & CON_NBCON)
+ seq = nbcon_seq_read(con);
+ else
+ seq = con->seq;
+
+ if (seq < init_seq)
+ init_seq = seq;
+ }
+ }
+
+ console_unlock();
+ }
+ }
+
+ return init_seq;
+}
+
+#define console_first() \
+ hlist_entry(console_list.first, struct console, node)
+
+static int unregister_console_locked(struct console *console);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -2724,64 +3950,76 @@ static int try_enable_new_console(struct console *newcon, bool user_specified)
*/
void register_console(struct console *newcon)
{
+ bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic;
+ bool bootcon_registered = false;
+ bool realcon_registered = false;
+ struct console *con;
unsigned long flags;
- struct console *bcon = NULL;
+ u64 init_seq;
int err;
- for_each_console(bcon) {
- if (WARN(bcon == newcon, "console '%s%d' already registered\n",
- bcon->name, bcon->index))
- return;
- }
+ console_list_lock();
- /*
- * before we register a new CON_BOOT console, make sure we don't
- * already have a valid console
- */
- if (newcon->flags & CON_BOOT) {
- for_each_console(bcon) {
- if (!(bcon->flags & CON_BOOT)) {
- pr_info("Too late to register bootconsole %s%d\n",
- newcon->name, newcon->index);
- return;
- }
+ for_each_console(con) {
+ if (WARN(con == newcon, "console '%s%d' already registered\n",
+ con->name, con->index)) {
+ goto unlock;
}
+
+ if (con->flags & CON_BOOT)
+ bootcon_registered = true;
+ else
+ realcon_registered = true;
}
- if (console_drivers && console_drivers->flags & CON_BOOT)
- bcon = console_drivers;
+ /* Do not register boot consoles when there already is a real one. */
+ if ((newcon->flags & CON_BOOT) && realcon_registered) {
+ pr_info("Too late to register bootconsole %s%d\n",
+ newcon->name, newcon->index);
+ goto unlock;
+ }
- if (!has_preferred_console || bcon || !console_drivers)
- has_preferred_console = preferred_console >= 0;
+ if (newcon->flags & CON_NBCON) {
+ /*
+ * Ensure the nbcon console buffers can be allocated
+ * before modifying any global data.
+ */
+ if (!nbcon_alloc(newcon))
+ goto unlock;
+ }
/*
- * See if we want to use this console driver. If we
- * didn't select a console we take the first one
- * that registers here.
+ * See if we want to enable this console driver by default.
+ *
+	 * Not when a console is preferred by the command line, device
+ * tree, or SPCR.
+ *
+ * The first real console with tty binding (driver) wins. More
+ * consoles might get enabled before the right one is found.
+ *
+ * Note that a console with tty binding will have CON_CONSDEV
+ * flag set and will be first in the list.
*/
- if (!has_preferred_console) {
- if (newcon->index < 0)
- newcon->index = 0;
- if (newcon->setup == NULL ||
- newcon->setup(newcon, NULL) == 0) {
- newcon->flags |= CON_ENABLED;
- if (newcon->device) {
- newcon->flags |= CON_CONSDEV;
- has_preferred_console = true;
- }
+ if (preferred_console < 0) {
+ if (hlist_empty(&console_list) || !console_first()->device ||
+ console_first()->flags & CON_BOOT) {
+ try_enable_default_console(newcon);
}
}
/* See if this console matches one we selected on the command line */
- err = try_enable_new_console(newcon, true);
+ err = try_enable_preferred_console(newcon, true);
/* If not, try to match against the platform default(s) */
if (err == -ENOENT)
- err = try_enable_new_console(newcon, false);
+ err = try_enable_preferred_console(newcon, false);
/* printk() messages are not printed to the Braille console. */
- if (err || newcon->flags & CON_BRL)
- return;
+ if (err || newcon->flags & CON_BRL) {
+ if (newcon->flags & CON_NBCON)
+ nbcon_free(newcon);
+ goto unlock;
+ }
/*
* If we have a bootconsole, and are switching to a real console,
@@ -2789,51 +4027,66 @@ void register_console(struct console *newcon)
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bootcon_registered &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ }
+
+ newcon->dropped = 0;
+ init_seq = get_init_console_seq(newcon, bootcon_registered);
+
+ if (newcon->flags & CON_NBCON) {
+ have_nbcon_console = true;
+ nbcon_seq_force(newcon, init_seq);
+ } else {
+ have_legacy_console = true;
+ newcon->seq = init_seq;
+ }
+
+ if (newcon->flags & CON_BOOT)
+ have_boot_console = true;
/*
- * Put this console in the list - keep the
- * preferred driver at the head of the list.
+ * If another context is actively using the hardware of this new
+	 * console, it will not be aware of the nbcon synchronization. This
+	 * creates a risk that two contexts could access the hardware
+ * simultaneously if this new console is used for atomic printing
+ * and the other context is still using the hardware.
+ *
+ * Use the driver synchronization to ensure that the hardware is not
+ * in use while this new console transitions to being registered.
*/
- console_lock();
- if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
- newcon->next = console_drivers;
- console_drivers = newcon;
- if (newcon->next)
- newcon->next->flags &= ~CON_CONSDEV;
- /* Ensure this flag is always set for the head of the list */
+ if (use_device_lock)
+ newcon->device_lock(newcon, &flags);
+
+ /*
+ * Put this console in the list - keep the
+ * preferred driver at the head of the list.
+ */
+ if (hlist_empty(&console_list)) {
+ /* Ensure CON_CONSDEV is always set for the head. */
newcon->flags |= CON_CONSDEV;
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
+ } else if (newcon->flags & CON_CONSDEV) {
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&newcon->node, &console_list);
+
} else {
- newcon->next = console_drivers->next;
- console_drivers->next = newcon;
+ hlist_add_behind_rcu(&newcon->node, console_list.first);
}
- if (newcon->flags & CON_EXTENDED)
- nr_ext_console_drivers++;
+ /*
+ * No need to synchronize SRCU here! The caller does not rely
+ * on all contexts being able to see the new console before
+ * register_console() completes.
+ */
+
+ /* This new console is now registered. */
+ if (use_device_lock)
+ newcon->device_unlock(newcon, flags);
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- logbuf_lock_irqsave(flags);
- /*
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
- console_seq = syslog_seq;
- console_idx = syslog_idx;
- logbuf_unlock_irqrestore(flags);
- }
- console_unlock();
console_sysfs_notify();
/*
@@ -2843,30 +4096,39 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- pr_info("%sconsole [%s%d] enabled\n",
- (newcon->flags & CON_BOOT) ? "boot" : "" ,
- newcon->name, newcon->index);
- if (bcon &&
+ con_printk(KERN_INFO, newcon, "enabled\n");
+ if (bootcon_registered &&
((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
!keep_bootcon) {
- /* We need to iterate through all boot consoles, to make
- * sure we print everything out, before we unregister them.
- */
- for_each_console(bcon)
- if (bcon->flags & CON_BOOT)
- unregister_console(bcon);
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
+ if (con->flags & CON_BOOT)
+ unregister_console_locked(con);
+ }
}
+
+ /* Changed console list, may require printer threads to start/stop. */
+ printk_kthreads_check_locked();
+unlock:
+ console_list_unlock();
}
EXPORT_SYMBOL(register_console);
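
To make the registration flow above concrete, here is a minimal legacy console as a driver might define it; a sketch only, with a stubbed write() routine standing in for real hardware output.

#include <linux/console.h>
#include <linux/init.h>

static void example_console_write(struct console *con, const char *s,
				  unsigned int count)
{
	/* push @count bytes from @s to the hardware (stubbed here) */
}

static struct console example_console = {
	.name	= "examplecon",
	.write	= example_console_write,
	.flags	= CON_PRINTBUFFER,	/* replay the existing log when registered */
	.index	= -1,
};

static int __init example_console_setup(void)
{
	register_console(&example_console);
	return 0;
}
console_initcall(example_console_setup);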
-int unregister_console(struct console *console)
+/* Must be called under console_list_lock(). */
+static int unregister_console_locked(struct console *console)
{
- struct console *con;
+ bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic;
+ bool found_legacy_con = false;
+ bool found_nbcon_con = false;
+ bool found_boot_con = false;
+ unsigned long flags;
+ struct console *c;
int res;
- pr_info("%sconsole [%s%d] disabled\n",
- (console->flags & CON_BOOT) ? "boot" : "" ,
- console->name, console->index);
+ lockdep_assert_console_list_lock_held();
+
+ con_printk(KERN_INFO, console, "disabled\n");
res = _braille_unregister_console(console);
if (res < 0)
@@ -2874,51 +4136,135 @@ int unregister_console(struct console *console)
if (res > 0)
return 0;
- res = -ENODEV;
- console_lock();
- if (console_drivers == console) {
- console_drivers=console->next;
- res = 0;
- } else {
- for_each_console(con) {
- if (con->next == console) {
- con->next = console->next;
- res = 0;
- break;
- }
- }
- }
+ if (!console_is_registered_locked(console))
+ res = -ENODEV;
+ else if (console_is_usable(console, console->flags, true))
+ __pr_flush(console, 1000, true);
+
+ /* Disable it unconditionally */
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+
+ if (res < 0)
+ return res;
+
+ /*
+ * Use the driver synchronization to ensure that the hardware is not
+ * in use while this console transitions to being unregistered.
+ */
+ if (use_device_lock)
+ console->device_lock(console, &flags);
- if (res)
- goto out_disable_unlock;
+ hlist_del_init_rcu(&console->node);
- if (console->flags & CON_EXTENDED)
- nr_ext_console_drivers--;
+ if (use_device_lock)
+ console->device_unlock(console, flags);
/*
+ * <HISTORICAL>
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
+ * </HISTORICAL>
+ *
+ * The above makes no sense as there is no guarantee that the next
+ * console has any device attached. Oh well....
*/
- if (console_drivers != NULL && console->flags & CON_CONSDEV)
- console_drivers->flags |= CON_CONSDEV;
+ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
+ console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts
+ * must not be able to see this console in the list so that any
+ * exit/cleanup routines can be performed safely.
+ */
+ synchronize_srcu(&console_srcu);
+
+ if (console->flags & CON_NBCON)
+ nbcon_free(console);
- console->flags &= ~CON_ENABLED;
- console_unlock();
console_sysfs_notify();
if (console->exit)
res = console->exit(console);
+ /*
+ * With this console gone, the global flags tracking registered
+ * console types may have changed. Update them.
+ */
+ for_each_console(c) {
+ if (c->flags & CON_BOOT)
+ found_boot_con = true;
+
+ if (c->flags & CON_NBCON)
+ found_nbcon_con = true;
+ else
+ found_legacy_con = true;
+ }
+ if (!found_boot_con)
+ have_boot_console = found_boot_con;
+ if (!found_legacy_con)
+ have_legacy_console = found_legacy_con;
+ if (!found_nbcon_con)
+ have_nbcon_console = found_nbcon_con;
+
+ /* Changed console list, may require printer threads to start/stop. */
+ printk_kthreads_check_locked();
+
return res;
+}
-out_disable_unlock:
- console->flags &= ~CON_ENABLED;
- console_unlock();
+int unregister_console(struct console *console)
+{
+ int res;
+ console_list_lock();
+ res = unregister_console_locked(console);
+ console_list_unlock();
return res;
}
EXPORT_SYMBOL(unregister_console);
+/**
+ * console_force_preferred_locked - force a registered console preferred
+ * @con: The registered console to force preferred.
+ *
+ * Must be called under console_list_lock().
+ */
+void console_force_preferred_locked(struct console *con)
+{
+ struct console *cur_pref_con;
+
+ if (!console_is_registered_locked(con))
+ return;
+
+ cur_pref_con = console_first();
+
+ /* Already preferred? */
+ if (cur_pref_con == con)
+ return;
+
+ /*
+ * Delete, but do not re-initialize the entry. This allows the console
+ * to continue to appear registered (via any hlist_unhashed_lockless()
+ * checks), even though it was briefly removed from the console list.
+ */
+ hlist_del_rcu(&con->node);
+
+ /*
+ * Ensure that all SRCU list walks have completed so that the console
+ * can be added to the beginning of the console list and its forward
+ * list pointer can be re-initialized.
+ */
+ synchronize_srcu(&console_srcu);
+
+ con->flags |= CON_CONSDEV;
+ WARN_ON(!con->device);
+
+ /* Only the new head can have CON_CONSDEV set. */
+ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV);
+ hlist_add_head_rcu(&con->node, &console_list);
+}
+EXPORT_SYMBOL(console_force_preferred_locked);
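
A short, hypothetical example of a caller (for instance, a debugger hook) promoting an already-registered console; console_force_preferred_locked() is a no-op for unregistered consoles, so no extra check is needed under the list lock.

#include <linux/console.h>

/* Hypothetical helper; the console pointer comes from the caller. */
static void example_prefer_console(struct console *con)
{
	console_list_lock();
	console_force_preferred_locked(con);	/* ignored if @con is not registered */
	console_list_unlock();
}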
+
/*
* Initialize the console device. This is called *early*, so
* we can't necessarily depend on lots of kernel help here.
@@ -2961,14 +4307,16 @@ void __init console_init(void)
*
* To mitigate this problem somewhat, only unregister consoles whose memory
* intersects with the init section. Note that all other boot consoles will
- * get unregistred when the real preferred console is registered.
+ * get unregistered when the real preferred console is registered.
*/
static int __init printk_late_init(void)
{
+ struct hlist_node *tmp;
struct console *con;
int ret;
- for_each_console(con) {
+ console_list_lock();
+ hlist_for_each_entry_safe(con, tmp, &console_list, node) {
if (!(con->flags & CON_BOOT))
continue;
@@ -2985,20 +4333,144 @@ static int __init printk_late_init(void)
*/
pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
con->name, con->index);
- unregister_console(con);
+ unregister_console_locked(con);
}
}
+ console_list_unlock();
+
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
console_cpu_notify, NULL);
WARN_ON(ret < 0);
+ printk_sysctl_init();
return 0;
}
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/* If @con is specified, only wait for that console. Otherwise wait for all. */
+static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
+{
+ unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+ unsigned long remaining_jiffies = timeout_jiffies;
+ struct console_flush_type ft;
+ struct console *c;
+ u64 last_diff = 0;
+ u64 printk_seq;
+ short flags;
+ int cookie;
+ u64 diff;
+ u64 seq;
+
+ /* Sorry, pr_flush() will not work this early. */
+ if (system_state < SYSTEM_SCHEDULING)
+ return false;
+
+ might_sleep();
+
+ seq = prb_next_reserve_seq(prb);
+
+ /* Flush the consoles so that records up to @seq are printed. */
+ printk_get_console_flush_type(&ft);
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.legacy_direct) {
+ console_lock();
+ console_unlock();
+ }
+
+ for (;;) {
+ unsigned long begin_jiffies;
+ unsigned long slept_jiffies;
+
+ diff = 0;
+
+ /*
+ * Hold the console_lock to guarantee safe access to
+ * console->seq. Releasing console_lock flushes more
+ * records in case @seq is still not printed on all
+ * usable consoles.
+ *
+ * Holding the console_lock is not necessary if there
+ * are no legacy or boot consoles. However, such a
+ * console could register at any time. Always hold the
+ * console_lock as a precaution rather than
+ * synchronizing against register_console().
+ */
+ console_lock();
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (con && con != c)
+ continue;
+
+ flags = console_srcu_read_flags(c);
+
+ /*
+ * If consoles are not usable, it cannot be expected
+ * that they make forward progress, so only increment
+ * @diff for usable consoles.
+ */
+ if (!console_is_usable(c, flags, true) &&
+ !console_is_usable(c, flags, false)) {
+ continue;
+ }
+
+ if (flags & CON_NBCON) {
+ printk_seq = nbcon_seq_read(c);
+ } else {
+ printk_seq = c->seq;
+ }
+
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+ console_srcu_read_unlock(cookie);
+
+ if (diff != last_diff && reset_on_progress)
+ remaining_jiffies = timeout_jiffies;
+
+ console_unlock();
+
+ /* Note: @diff is 0 if there are no usable consoles. */
+ if (diff == 0 || remaining_jiffies == 0)
+ break;
+
+ /* msleep(1) might sleep much longer. Check time by jiffies. */
+ begin_jiffies = jiffies;
+ msleep(1);
+ slept_jiffies = jiffies - begin_jiffies;
+
+ remaining_jiffies -= min(slept_jiffies, remaining_jiffies);
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Process context. May sleep while acquiring console lock.
+ * Return: true if all usable printers are caught up.
+ */
+static bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ return __pr_flush(NULL, timeout_ms, reset_on_progress);
+}
+
/*
* Delayed printk version, for scheduler-internal messages:
*/
@@ -3009,58 +4481,97 @@ static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
+ if (force_legacy_kthread()) {
+ if (printk_legacy_kthread)
+ wake_up_interruptible(&legacy_wait);
+ } else {
+ if (console_trylock())
+ console_unlock();
+ }
}
if (pending & PRINTK_PENDING_WAKEUP)
wake_up_interruptible(&log_wait);
}
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
-};
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
+ IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
-void wake_up_klogd(void)
+static void __wake_up_klogd(int val)
{
if (!printk_percpu_data_ready())
return;
preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ /*
+ * Guarantee any new records can be seen by tasks preparing to wait
+ * before this context checks if the wait queue is empty.
+ *
+ * The full memory barrier within wq_has_sleeper() pairs with the full
+ * memory barrier within set_current_state() of
+ * prepare_to_wait_event(), which is called after ___wait_event() adds
+ * the waiter but before it has checked the wait condition.
+ *
+ * This pairs with devkmsg_read:A and syslog_print:A.
+ */
+ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
+ (val & PRINTK_PENDING_OUTPUT)) {
+ this_cpu_or(printk_pending, val);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
}
preempt_enable();
}
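
The barrier comment above pairs with the waiter side of @log_wait. Below is a simplified, illustrative sketch of that side, assuming the file-local @log_wait and @prb symbols that the read paths in this file already use.

#include <linux/types.h>
#include <linux/wait.h>

/*
 * Simplified reader-side sketch: sleep until the record at @seq becomes
 * readable. The prepare_to_wait_event() inside wait_event_interruptible()
 * provides the full barrier that __wake_up_klogd:A pairs with.
 */
static int example_wait_for_record(u64 seq)
{
	return wait_event_interruptible(log_wait,
					prb_read_valid(prb, seq, NULL));
}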
-void defer_console_output(void)
+/**
+ * wake_up_klogd - Wake kernel logging daemon
+ *
+ * Use this function when new records have been added to the ringbuffer
+ * and the console printing of those records has already occurred or is
+ * known to be handled by some other context. This function will only
+ * wake the logging daemon.
+ *
+ * Context: Any context.
+ */
+void wake_up_klogd(void)
{
- if (!printk_percpu_data_ready())
- return;
-
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP);
}
-int vprintk_deferred(const char *fmt, va_list args)
+/**
+ * defer_console_output - Wake kernel logging daemon and trigger
+ * console printing in a deferred context
+ *
+ * Use this function when new records have been added to the ringbuffer,
+ * this context is responsible for console printing those records, but
+ * the current context is not allowed to perform the console printing.
+ * Trigger an irq_work context to perform the console printing. This
+ * function also wakes the logging daemon.
+ *
+ * Context: Any context.
+ */
+void defer_console_output(void)
{
- int r;
+ /*
+ * New messages may have been added directly to the ringbuffer
+ * using vprintk_store(), so wake any waiters as well.
+ */
+ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+}
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+void printk_trigger_flush(void)
+{
defer_console_output();
+}
- return r;
+int vprintk_deferred(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}
-int printk_deferred(const char *fmt, ...)
+int _printk_deferred(const char *fmt, ...)
{
va_list args;
int r;
@@ -3187,17 +4698,21 @@ const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
/**
- * kmsg_dump - dump kernel log to kernel message dumpers.
+ * kmsg_dump_desc - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
+ * @desc: a short string to describe what caused the panic or oops. Can be NULL
+ * if no additional description is available.
*
* Call each of the registered dumper's dump() callback, which can
* retrieve the kmsg records with kmsg_dump_get_line() or
* kmsg_dump_get_buffer().
*/
-void kmsg_dump(enum kmsg_dump_reason reason)
+void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
+ struct kmsg_dump_detail detail = {
+ .reason = reason,
+ .description = desc};
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3214,28 +4729,15 @@ void kmsg_dump(enum kmsg_dump_reason reason)
if (reason > max_reason)
continue;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
-
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
- logbuf_unlock_irqrestore(flags);
-
/* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
+ dumper->dump(dumper, &detail);
}
rcu_read_unlock();
}
/**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3249,82 +4751,56 @@ void kmsg_dump(enum kmsg_dump_reason reason)
*
* A return value of FALSE indicates that there are no more records to
* read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
+ char *line, size_t size, size_t *len)
{
- struct printk_log *msg;
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
size_t l = 0;
bool ret = false;
- if (!dumper->active)
- goto out;
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
+ prb_rec_init_rd(&r, &info, line, size);
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
- goto out;
+ /* Read text or count text lines? */
+ if (line) {
+ if (!prb_read_valid(prb, iter->cur_seq, &r))
+ goto out;
+ l = record_print_text(&r, syslog, printk_time);
+ } else {
+ if (!prb_read_valid_info(prb, iter->cur_seq,
+ &info, &line_count)) {
+ goto out;
+ }
+ l = get_record_print_text_size(&info, line_count, syslog,
+ printk_time);
- msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, syslog, printk_time, line, size);
+ }
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
+ iter->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
*len = l;
return ret;
}
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
- char *line, size_t size, size_t *len)
-{
- unsigned long flags;
- bool ret;
-
- logbuf_lock_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dump iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
- * @len: length of line placed into buffer
+ * @len_out: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
- * with as many of the the *youngest* kmsg records that fit into it.
+ * with as many of the *youngest* kmsg records that fit into it.
* If the buffer is large enough, all available kmsg records will be
* copied with a single call.
*
@@ -3334,114 +4810,232 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len)
+bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
+ char *buf, size_t size, size_t *len_out)
{
- unsigned long flags;
+ u64 min_seq = latched_seq_read_nolock(&clear_seq);
+ struct printk_info info;
+ struct printk_record r;
u64 seq;
- u32 idx;
u64 next_seq;
- u32 next_idx;
- size_t l = 0;
+ size_t len = 0;
bool ret = false;
bool time = printk_time;
- if (!dumper->active)
+ if (!buf || !size)
goto out;
- logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ if (iter->cur_seq < min_seq)
+ iter->cur_seq = min_seq;
+
+ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+ if (info.seq != iter->cur_seq) {
+ /* messages are gone, move to first available one */
+ iter->cur_seq = info.seq;
+ }
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump. Pass in size-1
+ * because this function (by way of record_print_text()) will
+ * not write more than size-1 bytes of text into @buf.
+ */
+ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
+ size - 1, syslog, time);
- l += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ /*
+ * Next kmsg_dump_get_buffer() invocation will dump block of
+ * older records stored right before this one.
+ */
+ next_seq = seq;
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (l >= size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ prb_rec_init_rd(&r, &info, buf, size);
- l -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
-
- /* last message in next interation */
- next_seq = seq;
- next_idx = idx;
+ prb_for_each_record(seq, prb, seq, &r) {
+ if (r.info->seq >= iter->next_seq)
+ break;
- l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ len += record_print_text(&r, syslog, time);
- l += msg_print_text(msg, syslog, time, buf + l, size - l);
- idx = log_next(idx);
- seq++;
+ /* Adjust record to store to remaining buffer space. */
+ prb_rec_init_rd(&r, &info, buf + len, size - len);
}
- dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
+ iter->next_seq = next_seq;
ret = true;
- logbuf_unlock_irqrestore(flags);
out:
- if (len)
- *len = l;
+ if (len_out)
+ *len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @dumper: registered kmsg dumper
+ * kmsg_dump_rewind - reset the iterator
+ * @iter: kmsg dump iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
+{
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
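
A sketch of a kmsg dumper built on the iterator API above: the dump() callback declares its own kmsg_dump_iter, rewinds it, and walks the log line by line. example_emit_line() is a stand-in for a real sink (flash, a firmware interface, etc.), and all names are hypothetical.

#include <linux/init.h>
#include <linux/kmsg_dump.h>

static void example_emit_line(const char *line, size_t len)
{
	/* hand one formatted log line to the hypothetical sink */
}

static void example_kmsg_dump(struct kmsg_dumper *dumper,
			      struct kmsg_dump_detail *detail)
{
	struct kmsg_dump_iter iter;
	static char line[1024];
	size_t len;

	if (detail->reason != KMSG_DUMP_PANIC)
		return;

	kmsg_dump_rewind(&iter);
	while (kmsg_dump_get_line(&iter, true, line, sizeof(line), &len))
		example_emit_line(line, len);
}

static struct kmsg_dumper example_dumper = {
	.dump		= example_kmsg_dump,
	.max_reason	= KMSG_DUMP_PANIC,	/* only called for panic */
};

static int __init example_dumper_init(void)
{
	return kmsg_dump_register(&example_dumper);
}
device_initcall(example_dumper_init);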
+
+/**
+ * console_try_replay_all - try to replay kernel log on consoles
+ *
+ * Try to obtain the console lock and replay all available records
+ * from the printk buffer on the consoles.
+ * Does nothing if the lock cannot be obtained.
*
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
+ * Context: Any, except for NMI.
*/
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+void console_try_replay_all(void)
+{
+ struct console_flush_type ft;
+
+ printk_get_console_flush_type(&ft);
+ if (console_trylock()) {
+ __console_rewind_all();
+ if (ft.nbcon_atomic)
+ nbcon_atomic_flush_pending();
+ if (ft.nbcon_offload)
+ nbcon_kthreads_wake();
+ if (ft.legacy_offload)
+ defer_console_output();
+ /* Consoles are flushed as part of console_unlock(). */
+ console_unlock();
+ }
+}
+#endif
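
A trivial, hypothetical caller of the function above: a console driver that has just recovered its device could request a best-effort replay so the recovered output is not empty.

#include <linux/console.h>
#include <linux/printk.h>

/* Hypothetical recovery hook. */
static void example_console_recovered(void)
{
	console_try_replay_all();	/* no-op if the console lock is contended */
}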
+
+#ifdef CONFIG_SMP
+static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
+static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
+
+bool is_printk_cpu_sync_owner(void)
{
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id());
}
/**
- * kmsg_dump_rewind - reset the iterator
- * @dumper: registered kmsg dumper
+ * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
+ * spinning lock is not owned by any CPU.
*
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
+ * Context: Any context.
*/
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+void __printk_cpu_sync_wait(void)
{
- unsigned long flags;
+ do {
+ cpu_relax();
+ } while (atomic_read(&printk_cpu_sync_owner) != -1);
+}
+EXPORT_SYMBOL(__printk_cpu_sync_wait);
- logbuf_lock_irqsave(flags);
- kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
+/**
+ * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
+ * spinning lock.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ * Return: 1 on success, otherwise 0.
+ */
+int __printk_cpu_sync_try_get(void)
+{
+ int cpu;
+ int old;
+
+ cpu = smp_processor_id();
+
+ /*
+ * Guarantee loads and stores from this CPU when it is the lock owner
+ * are _not_ visible to the previous lock owner. This pairs with
+ * __printk_cpu_sync_put:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_put:A can never read from
+ * __printk_cpu_sync_try_get:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
+ * of the previous CPU
+ * matching
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of this CPU
+ */
+ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
+ cpu); /* LMM(__printk_cpu_sync_try_get:A) */
+ if (old == -1) {
+ /*
+ * This CPU is now the owner and begins loading/storing
+ * data: LMM(__printk_cpu_sync_try_get:B)
+ */
+ return 1;
+
+ } else if (old == cpu) {
+ /* This CPU is already the owner. */
+ atomic_inc(&printk_cpu_sync_nested);
+ return 1;
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+EXPORT_SYMBOL(__printk_cpu_sync_try_get);
-#endif
+/**
+ * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
+ *
+ * The calling processor must be the owner of the lock.
+ *
+ * Context: Any context. Expects interrupts to be disabled.
+ */
+void __printk_cpu_sync_put(void)
+{
+ if (atomic_read(&printk_cpu_sync_nested)) {
+ atomic_dec(&printk_cpu_sync_nested);
+ return;
+ }
+
+ /*
+ * This CPU is finished loading/storing data:
+ * LMM(__printk_cpu_sync_put:A)
+ */
+
+ /*
+ * Guarantee loads and stores from this CPU when it was the
+ * lock owner are visible to the next lock owner. This pairs
+ * with __printk_cpu_sync_try_get:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
+ * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
+ *
+ * Relies on:
+ *
+ * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
+ * of this CPU
+ * matching
+ * ACQUIRE from __printk_cpu_sync_try_get:A to
+ * __printk_cpu_sync_try_get:B of the next CPU
+ */
+ atomic_set_release(&printk_cpu_sync_owner,
+ -1); /* LMM(__printk_cpu_sync_put:B) */
+}
+EXPORT_SYMBOL(__printk_cpu_sync_put);
+#endif /* CONFIG_SMP */
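
The low-level __printk_cpu_sync_*() primitives above are normally used through the printk_cpu_sync_get_irqsave()/printk_cpu_sync_put_irqrestore() wrappers in <linux/printk.h>. A minimal usage sketch (illustrative only, not part of this patch), for a caller that wants a multi-line dump to not interleave with output from other CPUs:

        unsigned long flags;

        /*
         * Illustrative sketch: serialize a group of related lines against
         * other CPUs using the same construct. The lock is CPU-reentrant,
         * so an NMI on this CPU may safely take it again while held here.
         */
        printk_cpu_sync_get_irqsave(flags);
        pr_info("CPU%d: begin state dump\n", raw_smp_processor_id());
        /* ... further pr_info() lines that should stay together ... */
        printk_cpu_sync_put_irqrestore(flags);
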
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
new file mode 100644
index 000000000000..88e8f3a61922
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.c
@@ -0,0 +1,2353 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include "printk_ringbuffer.h"
+#include "internal.h"
+
+/**
+ * DOC: printk_ringbuffer overview
+ *
+ * Data Structure
+ * --------------
+ * The printk_ringbuffer is made up of 2 internal ringbuffers:
+ *
+ * desc_ring
+ * A ring of descriptors and their meta data (such as sequence number,
+ * timestamp, loglevel, etc.) as well as internal state information about
+ * the record and logical positions specifying where in the other
+ * ringbuffer the text strings are located.
+ *
+ * text_data_ring
+ * A ring of data blocks. A data block consists of an unsigned long
+ * integer (ID) that maps to a desc_ring index followed by the text
+ * string of the record.
+ *
+ * The internal state information of a descriptor is the key element to allow
+ * readers and writers to locklessly synchronize access to the data.
+ *
+ * Implementation
+ * --------------
+ *
+ * Descriptor Ring
+ * ~~~~~~~~~~~~~~~
+ * The descriptor ring is an array of descriptors. A descriptor contains
+ * essential meta data to track the data of a printk record using
+ * blk_lpos structs pointing to associated text data blocks (see
+ * "Data Rings" below). Each descriptor is assigned an ID that maps
+ * directly to index values of the descriptor array and has a state. The ID
+ * and the state are bitwise combined into a single descriptor field named
+ * @state_var, allowing ID and state to be synchronously and atomically
+ * updated.
+ *
+ * Descriptors have four states:
+ *
+ * reserved
+ * A writer is modifying the record.
+ *
+ * committed
+ * The record and all its data are written. A writer can reopen the
+ * descriptor (transitioning it back to reserved), but in the committed
+ * state the data is consistent.
+ *
+ * finalized
+ * The record and all its data are complete and available for reading. A
+ * writer cannot reopen the descriptor.
+ *
+ * reusable
+ * The record exists, but its text and/or meta data may no longer be
+ * available.
+ *
+ * Querying the @state_var of a record requires providing the ID of the
+ * descriptor to query. This can yield a possible fifth (pseudo) state:
+ *
+ * miss
+ * The descriptor being queried has an unexpected ID.
+ *
+ * The descriptor ring has a @tail_id that contains the ID of the oldest
+ * descriptor and @head_id that contains the ID of the newest descriptor.
+ *
+ * When a new descriptor should be created (and the ring is full), the tail
+ * descriptor is invalidated by first transitioning to the reusable state and
+ * then invalidating all tail data blocks up to and including the data block
+ * associated with the tail descriptor (in the text data ring). Then
+ * @tail_id is advanced, followed by advancing @head_id. And finally the
+ * @state_var of the new descriptor is initialized to the new ID and reserved
+ * state.
+ *
+ * The @tail_id can only be advanced if the new @tail_id would be in the
+ * finalized or reusable queried state. This guarantees that a valid
+ * sequence number for the tail is always available.
+ *
+ * Descriptor Finalization
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When a writer calls the commit function prb_commit(), record data is
+ * fully stored and is consistent within the ringbuffer. However, a writer can
+ * reopen that record, claiming exclusive access (as with prb_reserve()), and
+ * modify that record. When finished, the writer must again commit the record.
+ *
+ * In order for a record to be made available to readers (and also become
+ * recyclable for writers), it must be finalized. A finalized record cannot be
+ * reopened and can never become "unfinalized". Record finalization can occur
+ * in three different scenarios:
+ *
+ * 1) A writer can simultaneously commit and finalize its record by calling
+ * prb_final_commit() instead of prb_commit().
+ *
+ * 2) When a new record is reserved and the previous record has been
+ * committed via prb_commit(), that previous record is automatically
+ * finalized.
+ *
+ * 3) When a record is committed via prb_commit() and a newer record
+ * already exists, the record being committed is automatically finalized.
+ *
+ * Data Ring
+ * ~~~~~~~~~
+ * The text data ring is a byte array composed of data blocks. Data blocks are
+ * referenced by blk_lpos structs that point to the logical position of the
+ * beginning of a data block and the beginning of the next adjacent data
+ * block. Logical positions are mapped directly to index values of the byte
+ * array ringbuffer.
+ *
+ * Each data block consists of an ID followed by the writer data. The ID is
+ * the identifier of a descriptor that is associated with the data block. A
+ * given data block is considered valid if all of the following conditions
+ * are met:
+ *
+ * 1) The descriptor associated with the data block is in the committed
+ * or finalized queried state.
+ *
+ * 2) The blk_lpos struct within the descriptor associated with the data
+ * block references back to the same data block.
+ *
+ * 3) The data block is within the head/tail logical position range.
+ *
+ * If the writer data of a data block would extend beyond the end of the
+ * byte array, only the ID of the data block is stored at the logical
+ * position and the full data block (ID and writer data) is stored at the
+ * beginning of the byte array. The referencing blk_lpos will point to the
+ * ID before the wrap and the next data block will be at the logical
+ * position adjacent to the full data block after the wrap.
+ *
+ * Data rings have a @tail_lpos that points to the beginning of the oldest
+ * data block and a @head_lpos that points to the logical position of the
+ * next (not yet existing) data block.
+ *
+ * When a new data block should be created (and the ring is full), tail data
+ * blocks will first be invalidated by putting their associated descriptors
+ * into the reusable state and then pushing the @tail_lpos forward beyond
+ * them. Then the @head_lpos is pushed forward and is associated with a new
+ * descriptor. If a data block is not valid, the @tail_lpos cannot be
+ * advanced beyond it.
+ *
+ * Info Array
+ * ~~~~~~~~~~
+ * The general meta data of printk records is stored in printk_info structs,
+ * held in an array with the same number of elements as the descriptor ring.
+ * Each info corresponds to the descriptor of the same index in the
+ * descriptor ring. Info validity is confirmed by evaluating the corresponding
+ * descriptor before and after loading the info.
+ *
+ * Usage
+ * -----
+ * Here are some simple examples demonstrating writers and readers. For the
+ * examples a global ringbuffer (test_rb) is available (which is not the
+ * actual ringbuffer used by printk)::
+ *
+ * DEFINE_PRINTKRB(test_rb, 15, 5);
+ *
+ * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
+ * 1 MiB (2 ^ (15 + 5)) for text data.
+ *
+ * Sample writer code::
+ *
+ * const char *textstr = "message text";
+ * struct prb_reserved_entry e;
+ * struct printk_record r;
+ *
+ * // specify how much to allocate
+ * prb_rec_init_wr(&r, strlen(textstr) + 1);
+ *
+ * if (prb_reserve(&e, &test_rb, &r)) {
+ * snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Note that additional writer functions are available to extend a record
+ * after it has been committed but not yet finalized. This can be done as
+ * long as no new records have been reserved and the caller is the same.
+ *
+ * Sample writer code (record extending)::
+ *
+ * // alternate rest of previous example
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit the record (but do not finalize yet)
+ * prb_commit(&e);
+ * }
+ *
+ * ...
+ *
+ * // specify additional 5 bytes text space to extend
+ * prb_rec_init_wr(&r, 5);
+ *
+ * // try to extend, but only if it does not exceed 32 bytes
+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
+ * snprintf(&r.text_buf[r.info->text_len],
+ * r.text_buf_size - r.info->text_len, "hello");
+ *
+ * r.info->text_len += 5;
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Sample reader code::
+ *
+ * struct printk_info info;
+ * struct printk_record r;
+ * char text_buf[32];
+ * u64 seq;
+ *
+ * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
+ *
+ * prb_for_each_record(0, &test_rb, &seq, &r) {
+ * if (info.seq != seq)
+ * pr_warn("lost %llu records\n", info.seq - seq);
+ *
+ * if (info.text_len > r.text_buf_size) {
+ * pr_warn("record %llu text truncated\n", info.seq);
+ * text_buf[r.text_buf_size - 1] = 0;
+ * }
+ *
+ * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
+ * &text_buf[0]);
+ * }
+ *
+ * Note that additional less convenient reader functions are available to
+ * allow complex record access.
+ *
+ * ABA Issues
+ * ~~~~~~~~~~
+ * To help avoid ABA issues, descriptors are referenced by IDs (array index
+ * values combined with tagged bits counting array wraps) and data blocks are
+ * referenced by logical positions (array index values combined with tagged
+ * bits counting array wraps). However, on 32-bit systems the number of
+ * tagged bits is relatively small such that an ABA incident is (at least
+ * theoretically) possible. For example, if 4 million maximally sized (1KiB)
+ * printk messages were to occur in NMI context on a 32-bit system, the
+ * interrupted context would not be able to recognize that the 32-bit integer
+ * completely wrapped and thus represents a different data block than the one
+ * the interrupted context expects.
+ *
+ * To help combat this possibility, additional state checking is performed
+ * (such as using cmpxchg() even though set() would suffice). These extra
+ * checks are commented as such and will hopefully catch any ABA issue that
+ * a 32-bit system might experience.
+ *
+ * Memory Barriers
+ * ~~~~~~~~~~~~~~~
+ * Multiple memory barriers are used. To simplify proving correctness and
+ * generating litmus tests, lines of code related to memory barriers
+ * (loads, stores, and the associated memory barriers) are labeled::
+ *
+ * LMM(function:letter)
+ *
+ * Comments reference the labels using only the "function:letter" part.
+ *
+ * The memory barrier pairs and their ordering are:
+ *
+ * desc_reserve:D / desc_reserve:B
+ * push descriptor tail (id), then push descriptor head (id)
+ *
+ * desc_reserve:D / data_push_tail:B
+ * push data tail (lpos), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / desc_push_tail:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / prb_first_seq:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:F / desc_read:D
+ * set new descriptor id and reserved (state), then allow writer changes
+ *
+ * data_alloc:A (or data_realloc:A) / desc_read:D
+ * set old descriptor reusable (state), then modify new data block area
+ *
+ * data_alloc:A (or data_realloc:A) / data_push_tail:B
+ * push data tail (lpos), then modify new data block area
+ *
+ * _prb_commit:B / desc_read:B
+ * store writer changes, then set new descriptor committed (state)
+ *
+ * desc_reopen_last:A / _prb_commit:B
+ * set descriptor reserved (state), then read descriptor data
+ *
+ * _prb_commit:B / desc_reserve:D
+ * set new descriptor committed (state), then check descriptor head (id)
+ *
+ * data_push_tail:D / data_push_tail:A
+ * set descriptor reusable (state), then push data tail (lpos)
+ *
+ * desc_push_tail:B / desc_reserve:D
+ * set descriptor reusable (state), then push descriptor tail (id)
+ *
+ * desc_update_last_finalized:A / desc_last_finalized_seq:A
+ * store finalized record, then set new highest finalized sequence number
+ */
+
+#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
+#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1)
+
+#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits)
+#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1)
+
+/* Determine the data array index from a logical position. */
+#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring))
+
+/* Determine the desc array index from an ID or sequence number. */
+#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring))
+
+/* Determine how many times the data array has wrapped. */
+#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits)
+
+/* Determine if a logical position refers to a data-less block. */
+#define LPOS_DATALESS(lpos) ((lpos) & 1UL)
+#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \
+ LPOS_DATALESS((blk)->next))
+
+/* Get the logical position at index 0 of the current wrap. */
+#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
+((lpos) & ~DATA_SIZE_MASK(data_ring))
+
+/* Get the ID for the same index of the previous wrap as the given ID. */
+#define DESC_ID_PREV_WRAP(desc_ring, id) \
+DESC_ID((id) - DESCS_COUNT(desc_ring))
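
To make the logical-position arithmetic concrete, a small worked example (illustrative only) for a hypothetical data ring with size_bits = 5, i.e. a 32-byte data array:

        /* Hypothetical ring: size_bits = 5, so DATA_SIZE() is 32 bytes. */
        unsigned long size_bits = 5;
        unsigned long lpos = 72;

        unsigned long index = lpos & ((1UL << size_bits) - 1); /* DATA_INDEX(): 72 & 31 = 8 */
        unsigned long wraps = lpos >> size_bits;                /* DATA_WRAPS(): 72 >> 5 = 2 */
        bool dataless = lpos & 1UL;                             /* LPOS_DATALESS(): false    */

Real block positions are aligned to sizeof(id), so only the special data-less marker values (such as FAILED_LPOS and EMPTY_LINE_LPOS) are recognized by their set low bit.
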
+
+/*
+ * A data block: mapped directly to the beginning of the data block area
+ * specified as a logical position within the data ring.
+ *
+ * @id: the ID of the associated descriptor
+ * @data: the writer data
+ *
+ * Note that the size of a data block is only known by its associated
+ * descriptor.
+ */
+struct prb_data_block {
+ unsigned long id;
+ char data[];
+};
+
+/*
+ * Return the descriptor associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->descs[DESC_INDEX(desc_ring, n)];
+}
+
+/*
+ * Return the printk_info associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->infos[DESC_INDEX(desc_ring, n)];
+}
+
+static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
+ unsigned long begin_lpos)
+{
+ return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)];
+}
+
+/*
+ * Increase the data size to account for data block meta data plus any
+ * padding so that the adjacent data block is aligned on the ID size.
+ */
+static unsigned int to_blk_size(unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ size += sizeof(*db);
+ size = ALIGN(size, sizeof(db->id));
+ return size;
+}
+
+/*
+ * Sanity checker for reserve size. The ringbuffer code assumes that a data
+ * block does not exceed the maximum possible size that could fit within the
+ * ringbuffer. This function provides that basic size check so that the
+ * assumption is safe.
+ */
+static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ if (size == 0)
+ return true;
+
+ /*
+ * Ensure the alignment padded size could possibly fit in the data
+ * array. The largest possible data block must still leave room for
+ * at least the ID of the next block.
+ */
+ size = to_blk_size(size);
+ if (size > DATA_SIZE(data_ring) - sizeof(db->id))
+ return false;
+
+ return true;
+}
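
A worked example for these two helpers (illustrative only, assuming a 64-bit kernel where sizeof(unsigned long) == 8): a request for 13 bytes of text becomes a 24-byte data block, which data_check_size() then requires to fit while leaving room for the next block's ID:

        /* to_blk_size(13) on a 64-bit kernel: */
        unsigned int size = 13;

        size += sizeof(unsigned long);             /* + 8-byte block ID -> 21 */
        size = ALIGN(size, sizeof(unsigned long)); /* align to ID size  -> 24 */

        /* data_check_size() then checks: 24 <= DATA_SIZE(data_ring) - 8 */
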
+
+/* Query the state of a descriptor. */
+static enum desc_state get_desc_state(unsigned long id,
+ unsigned long state_val)
+{
+ if (id != DESC_ID(state_val))
+ return desc_miss;
+
+ return DESC_STATE(state_val);
+}
+
+/*
+ * Get a copy of a specified descriptor and return its queried state. If the
+ * descriptor is in an inconsistent state (miss or reserved), the caller can
+ * only expect the descriptor's @state_var field to be valid.
+ *
+ * The sequence number and caller_id can be optionally retrieved. Like all
+ * non-state_var data, they are only valid if the descriptor is in a
+ * consistent state.
+ */
+static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
+ unsigned long id, struct prb_desc *desc_out,
+ u64 *seq_out, u32 *caller_id_out)
+{
+ struct printk_info *info = to_info(desc_ring, id);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+ enum desc_state d_state;
+ unsigned long state_val;
+
+ /* Check the descriptor state. */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
+ d_state = get_desc_state(id, state_val);
+ if (d_state == desc_miss || d_state == desc_reserved) {
+ /*
+ * The descriptor is in an inconsistent state. Set at least
+ * @state_var so that the caller can see the details of
+ * the inconsistent state.
+ */
+ goto out;
+ }
+
+ /*
+ * Guarantee the state is loaded before copying the descriptor
+ * content. This avoids copying obsolete descriptor content that might
+ * not apply to the descriptor state. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
+ * from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * RMB from desc_read:A to desc_read:C
+ */
+ smp_rmb(); /* LMM(desc_read:B) */
+
+ /*
+ * Copy the descriptor data. The data is not valid until the
+ * state has been re-checked. A memcpy() for all of @desc
+ * cannot be used because of the atomic_t @state_var field.
+ */
+ if (desc_out) {
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ }
+ if (seq_out)
+ *seq_out = info->seq; /* also part of desc_read:C */
+ if (caller_id_out)
+ *caller_id_out = info->caller_id; /* also part of desc_read:C */
+
+ /*
+ * 1. Guarantee the descriptor content is loaded before re-checking
+ * the state. This avoids reading an obsolete descriptor state
+ * that may not apply to the copied content. This pairs with
+ * desc_reserve:F.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:C reads from desc_reserve:G, then desc_read:E
+ * reads from desc_reserve:F.
+ *
+ * Relies on:
+ *
+ * WMB from desc_reserve:F to desc_reserve:G
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * 2. Guarantee the record data is loaded before re-checking the
+ * state. This avoids reading an obsolete descriptor state that may
+ * not apply to the copied data. This pairs with data_alloc:A and
+ * data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If copy_data:A reads from data_alloc:B, then desc_read:E
+ * reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_alloc:B
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * Note: desc_make_reusable:A and data_alloc:B can be different
+ * CPUs. However, the data_alloc:B CPU (which performs the
+ * full memory barrier) must have previously seen
+ * desc_make_reusable:A.
+ */
+ smp_rmb(); /* LMM(desc_read:D) */
+
+ /*
+ * The data has been copied. Return the current descriptor state,
+ * which may have changed since the load above.
+ */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
+ d_state = get_desc_state(id, state_val);
+out:
+ if (desc_out)
+ atomic_long_set(&desc_out->state_var, state_val);
+ return d_state;
+}
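
A sketch of how callers in this file consume the returned state (illustrative only; desc_ring and id are assumed to be in scope). Only the consistent states allow trusting the copied descriptor and printk_info fields:

        struct prb_desc desc;
        u64 seq;

        switch (desc_read(desc_ring, id, &desc, &seq, NULL)) {
        case desc_committed:
        case desc_finalized:
                /* @desc and @seq are a consistent snapshot of record @id. */
                break;
        case desc_reusable:
                /* Consistent snapshot, but the data blocks may be recycled. */
                break;
        case desc_miss:
        case desc_reserved:
                /* Inconsistent: only desc.state_var is meaningful. */
                break;
        }
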
+
+/*
+ * Take a specified descriptor out of the finalized state by attempting
+ * the transition from finalized to reusable. Either this context or some
+ * other context will have been successful.
+ */
+static void desc_make_reusable(struct prb_desc_ring *desc_ring,
+ unsigned long id)
+{
+ unsigned long val_finalized = DESC_SV(id, desc_finalized);
+ unsigned long val_reusable = DESC_SV(id, desc_reusable);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+
+ atomic_long_cmpxchg_relaxed(state_var, val_finalized,
+ val_reusable); /* LMM(desc_make_reusable:A) */
+}
+
+/*
+ * Given the text data ring, put the associated descriptor of each
+ * data block from @lpos_begin until @lpos_end into the reusable state.
+ *
+ * If there is any problem making the associated descriptor reusable, either
+ * the descriptor has not yet been finalized or another writer context has
+ * already pushed the tail lpos past the problematic data block. Regardless,
+ * on error the caller can re-load the tail lpos to determine the situation.
+ */
+static bool data_make_reusable(struct printk_ringbuffer *rb,
+ unsigned long lpos_begin,
+ unsigned long lpos_end,
+ unsigned long *lpos_out)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct prb_data_block *blk;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
+ unsigned long id;
+
+ /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
+ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
+ blk = to_block(data_ring, lpos_begin);
+
+ /*
+ * Load the block ID from the data block. This is a data race
+ * against a writer that may have newly reserved this data
+ * area. If the loaded value matches a valid descriptor ID,
+ * the blk_lpos of that descriptor will be checked to make
+ * sure it points back to this data block. If the check fails,
+ * the data area has been recycled by another writer.
+ */
+ id = blk->id; /* LMM(data_make_reusable:A) */
+
+ d_state = desc_read(desc_ring, id, &desc,
+ NULL, NULL); /* LMM(data_make_reusable:B) */
+
+ switch (d_state) {
+ case desc_miss:
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ desc_make_reusable(desc_ring, id);
+ break;
+ case desc_reusable:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ break;
+ }
+
+ /* Advance @lpos_begin to the next data block. */
+ lpos_begin = blk_lpos->next;
+ }
+
+ *lpos_out = lpos_begin;
+ return true;
+}
+
+/*
+ * Advance the data ring tail to at least @lpos. This function puts
+ * descriptors into the reusable state if the tail is pushed beyond
+ * their associated data block.
+ */
+static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ unsigned long tail_lpos_new;
+ unsigned long tail_lpos;
+ unsigned long next_lpos;
+
+ /* If @lpos is from a data-less block, there is nothing to do. */
+ if (LPOS_DATALESS(lpos))
+ return true;
+
+ /*
+ * Any descriptor states that have transitioned to reusable due to the
+ * data tail being pushed to this loaded value will be visible to this
+ * CPU. This pairs with data_push_tail:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_push_tail:A reads from data_push_tail:D, then this CPU can
+ * see desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_push_tail:D
+ * matches
+ * READFROM from data_push_tail:D to data_push_tail:A
+ * thus
+ * READFROM from desc_make_reusable:A to this CPU
+ */
+ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */
+
+ /*
+ * Loop until the tail lpos is at or beyond @lpos. This condition
+ * may already be satisfied, resulting in no full memory barrier
+ * from data_push_tail:D being performed. However, since this CPU
+ * sees the new tail lpos, any descriptor states that transitioned to
+ * the reusable state must already be visible.
+ */
+ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
+ /*
+ * Make all descriptors reusable that are associated with
+ * data blocks before @lpos.
+ */
+ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
+ /*
+ * 1. Guarantee the block ID loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled data area causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * data_alloc:A and data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:A reads from data_alloc:B,
+ * then data_push_tail:C reads from
+ * data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to data_alloc:B
+ * matching
+ * RMB from data_make_reusable:A to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and data_alloc:B can be
+ * different CPUs. However, the data_alloc:B
+ * CPU (which performs the full memory
+ * barrier) must have previously seen
+ * data_push_tail:D.
+ *
+ * 2. Guarantee the descriptor state loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled descriptor causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:B reads from
+ * desc_reserve:F, then data_push_tail:C reads
+ * from data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to desc_reserve:F
+ * matching
+ * RMB from data_make_reusable:B to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and desc_reserve:F can
+ * be different CPUs. However, the
+ * desc_reserve:F CPU (which performs the
+ * full memory barrier) must have previously
+ * seen data_push_tail:D.
+ */
+ smp_rmb(); /* LMM(data_push_tail:B) */
+
+ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
+ ); /* LMM(data_push_tail:C) */
+ if (tail_lpos_new == tail_lpos)
+ return false;
+
+ /* Another CPU pushed the tail. Try again. */
+ tail_lpos = tail_lpos_new;
+ continue;
+ }
+
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail lpos. A full
+ * memory barrier is needed since other CPUs may have made
+ * the descriptor states reusable. This pairs with
+ * data_push_tail:A.
+ */
+ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
+ next_lpos)) { /* LMM(data_push_tail:D) */
+ break;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Advance the desc ring tail. This function advances the tail by one
+ * descriptor, thus invalidating the oldest descriptor. Before advancing
+ * the tail, the tail descriptor is made reusable and all data blocks up to
+ * and including the descriptor's data block are invalidated (i.e. the data
+ * ring tail is pushed past the data block of the descriptor being made
+ * reusable).
+ */
+static bool desc_push_tail(struct printk_ringbuffer *rb,
+ unsigned long tail_id)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+
+ d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);
+
+ switch (d_state) {
+ case desc_miss:
+ /*
+ * If the ID is exactly 1 wrap behind the expected, it is
+ * in the process of being reserved by another writer and
+ * must be considered reserved.
+ */
+ if (DESC_ID(atomic_long_read(&desc.state_var)) ==
+ DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
+ return false;
+ }
+
+ /*
+ * The ID has changed. Another writer must have pushed the
+ * tail and recycled the descriptor already. Success is
+ * returned because the caller is only interested in the
+ * specified tail being pushed, which it was.
+ */
+ return true;
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ desc_make_reusable(desc_ring, tail_id);
+ break;
+ case desc_reusable:
+ break;
+ }
+
+ /*
+ * Data blocks must be invalidated before their associated
+ * descriptor can be made available for recycling. Invalidating
+ * them later is not possible because there is no way to trust
+ * data blocks once their associated descriptor is gone.
+ */
+
+ if (!data_push_tail(rb, desc.text_blk_lpos.next))
+ return false;
+
+ /*
+ * Check the next descriptor after @tail_id before pushing the tail
+ * to it because the tail must always be in a finalized or reusable
+ * state. The implementation of prb_first_seq() relies on this.
+ *
+ * A successful read implies that the next descriptor is less than or
+ * equal to @head_id so there is no risk of pushing the tail past the
+ * head.
+ */
+ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
+ NULL, NULL); /* LMM(desc_push_tail:A) */
+
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail ID. This allows
+ * verifying the recycled descriptor state. A full memory
+ * barrier is needed since other CPUs may have made the
+ * descriptor states reusable. This pairs with desc_reserve:D.
+ */
+ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
+ DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
+ } else {
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail ID in the
+ * case that the descriptor has been recycled. This pairs
+ * with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_push_tail:A reads from desc_reserve:F, then
+ * desc_push_tail:D reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB from desc_push_tail:A to desc_push_tail:D
+ *
+ * Note: desc_push_tail:B and desc_reserve:F can be different
+ * CPUs. However, the desc_reserve:F CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_push_tail:C) */
+
+ /*
+ * Re-check the tail ID. The descriptor following @tail_id is
+ * not in an allowed tail state. But if the tail has since
+ * been moved by another CPU, then it does not matter.
+ */
+ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
+ return false;
+ }
+
+ return true;
+}
+
+/* Reserve a new descriptor, invalidating the oldest if necessary. */
+static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long prev_state_val;
+ unsigned long id_prev_wrap;
+ struct prb_desc *desc;
+ unsigned long head_id;
+ unsigned long id;
+
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
+
+ do {
+ id = DESC_ID(head_id + 1);
+ id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
+
+ /*
+ * Guarantee the head ID is read before reading the tail ID.
+ * Since the tail ID is updated before the head ID, this
+ * guarantees that @id_prev_wrap is never ahead of the tail
+ * ID. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:A reads from desc_reserve:D, then
+ * desc_reserve:C reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:D
+ * matching
+ * RMB from desc_reserve:A to desc_reserve:C
+ *
+ * Note: desc_push_tail:B and desc_reserve:D can be different
+ * CPUs. However, the desc_reserve:D CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_reserve:B) */
+
+ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
+ )) { /* LMM(desc_reserve:C) */
+ /*
+ * Make space for the new descriptor by
+ * advancing the tail.
+ */
+ if (!desc_push_tail(rb, id_prev_wrap))
+ return false;
+ }
+
+ /*
+ * 1. Guarantee the tail ID is read before validating the
+ * recycled descriptor state. A read memory barrier is
+ * sufficient for this. This pairs with desc_push_tail:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:C reads from desc_push_tail:B, then
+ * desc_reserve:E reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to desc_push_tail:B
+ * matching
+ * RMB from desc_reserve:C to desc_reserve:E
+ *
+ * Note: desc_make_reusable:A and desc_push_tail:B can be
+ * different CPUs. However, the desc_push_tail:B CPU
+ * (which performs the full memory barrier) must have
+ * previously seen desc_make_reusable:A.
+ *
+ * 2. Guarantee the tail ID is stored before storing the head
+ * ID. This pairs with desc_reserve:B.
+ *
+ * 3. Guarantee any data ring tail changes are stored before
+ * recycling the descriptor. Data ring tail changes can
+ * happen via desc_push_tail()->data_push_tail(). A full
+ * memory barrier is needed since another CPU may have
+ * pushed the data ring tails. This pairs with
+ * data_push_tail:B.
+ *
+ * 4. Guarantee a new tail ID is stored before recycling the
+ * descriptor. A full memory barrier is needed since
+ * another CPU may have pushed the tail ID. This pairs
+ * with desc_push_tail:C and this also pairs with
+ * prb_first_seq:C.
+ *
+ * 5. Guarantee the head ID is stored before trying to
+ * finalize the previous descriptor. This pairs with
+ * _prb_commit:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
+ id)); /* LMM(desc_reserve:D) */
+
+ desc = to_desc(desc_ring, id);
+
+ /*
+ * If the descriptor has been recycled, verify the old state val.
+ * See "ABA Issues" about why this verification is performed.
+ */
+ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
+ if (prev_state_val &&
+ get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Assign the descriptor a new ID and set its state to reserved.
+ * See "ABA Issues" about why cmpxchg() instead of set() is used.
+ *
+ * Guarantee the new descriptor ID and state is stored before making
+ * any other changes. A write memory barrier is sufficient for this.
+ * This pairs with desc_read:D.
+ */
+ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Now data in @desc can be modified: LMM(desc_reserve:G) */
+
+ *id_out = id;
+ return true;
+}
+
+/* Determine the end of a data block. */
+static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
+ unsigned long lpos, unsigned int size)
+{
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ begin_lpos = lpos;
+ next_lpos = lpos + size;
+
+ /* First check if the data block does not wrap. */
+ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos))
+ return next_lpos;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size);
+}
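
A worked example of the wrap handling (illustrative only), again for a hypothetical 32-byte data ring (size_bits = 5): a 16-byte block starting at lpos 24 would cross the end of the array, so its data is placed at the start of the next wrap:

        /* Hypothetical 32-byte ring: begin_lpos = 24, size = 16 (after to_blk_size()). */
        unsigned long begin_lpos = 24;
        unsigned int size = 16;

        /*
         * begin is in wrap 0, begin + size = 40 is in wrap 1: the block wraps.
         * get_next_lpos() returns (40 & ~31UL) + 16 = 32 + 16 = 48. data_alloc()
         * later stores the block ID at index 24 and the full block at index 0.
         */
        unsigned long next_lpos = ((begin_lpos + size) & ~31UL) + size;
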
+
+/*
+ * Allocate a new data block, invalidating the oldest data block(s)
+ * if necessary. This function also associates the data block with
+ * a specified descriptor.
+ */
+static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_data_block *blk;
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ if (size == 0) {
+ /*
+ * Data blocks are not created for empty lines. Instead, the
+ * reader will recognize these special lpos values and handle
+ * it appropriately.
+ */
+ blk_lpos->begin = EMPTY_LINE_LPOS;
+ blk_lpos->next = EMPTY_LINE_LPOS;
+ return NULL;
+ }
+
+ size = to_blk_size(size);
+
+ begin_lpos = atomic_long_read(&data_ring->head_lpos);
+
+ do {
+ next_lpos = get_next_lpos(data_ring, begin_lpos, size);
+
+ if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
+ /* Failed to allocate, specify a data-less block. */
+ blk_lpos->begin = FAILED_LPOS;
+ blk_lpos->next = FAILED_LPOS;
+ return NULL;
+ }
+
+ /*
+ * 1. Guarantee any descriptor states that have transitioned
+ * to reusable are stored before modifying the newly
+ * allocated data area. A full memory barrier is needed
+ * since other CPUs may have made the descriptor states
+ * reusable. See data_push_tail:A about why the reusable
+ * states are visible. This pairs with desc_read:D.
+ *
+ * 2. Guarantee any updated tail lpos is stored before
+ * modifying the newly allocated data area. Another CPU may
+ * be in data_make_reusable() and is reading a block ID
+ * from this area. data_make_reusable() can handle reading
+ * a garbage block ID value, but then it must be able to
+ * load a new tail lpos. A full memory barrier is needed
+ * since other CPUs may have updated the tail lpos. This
+ * pairs with data_push_tail:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
+ next_lpos)); /* LMM(data_alloc:A) */
+
+ blk = to_block(data_ring, begin_lpos);
+ blk->id = id; /* LMM(data_alloc:B) */
+
+ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+ }
+
+ blk_lpos->begin = begin_lpos;
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/*
+ * Try to resize an existing data block associated with the descriptor
+ * specified by @id. If the resized data block should become wrapped, it
+ * copies the old data to the new data block. If @size yields a data block
+ * with the same or less size, the data block is left as is.
+ *
+ * Fail if this is not the last allocated data block or if there is not
+ * enough space or it is not possible to make enough space.
+ *
+ * Return a pointer to the beginning of the entire data buffer or NULL on
+ * failure.
+ */
+static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_ring *data_ring = &rb->text_data_ring;
+ struct prb_data_block *blk;
+ unsigned long head_lpos;
+ unsigned long next_lpos;
+ bool wrapped;
+
+ /* Reallocation only works if @blk_lpos is the newest data block. */
+ head_lpos = atomic_long_read(&data_ring->head_lpos);
+ if (head_lpos != blk_lpos->next)
+ return NULL;
+
+ /* Keep track if @blk_lpos was a wrapping data block. */
+ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));
+
+ size = to_blk_size(size);
+
+ next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
+
+ /* If the data block does not increase, there is nothing to do. */
+ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
+ if (wrapped)
+ blk = to_block(data_ring, 0);
+ else
+ blk = to_block(data_ring, blk_lpos->begin);
+ return &blk->data[0];
+ }
+
+ if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
+ return NULL;
+
+ /* The memory barrier involvement is the same as data_alloc:A. */
+ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
+ next_lpos)) { /* LMM(data_realloc:A) */
+ return NULL;
+ }
+
+ blk = to_block(data_ring, blk_lpos->begin);
+
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
+ struct prb_data_block *old_blk = blk;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+
+ if (!wrapped) {
+ /*
+ * Since the allocated space is now in the newly
+ * created wrapping data block, copy the content
+ * from the old data block.
+ */
+ memcpy(&blk->data[0], &old_blk->data[0],
+ (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
+ }
+ }
+
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/* Return the number of bytes used by a data block. */
+static unsigned int space_used(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos)
+{
+ /* Data-less blocks take no space. */
+ if (BLK_DATALESS(blk_lpos))
+ return 0;
+
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
+ /* Data block does not wrap. */
+ return (DATA_INDEX(data_ring, blk_lpos->next) -
+ DATA_INDEX(data_ring, blk_lpos->begin));
+ }
+
+ /*
+ * For wrapping data blocks, the trailing (wasted) space is
+ * also counted.
+ */
+ return (DATA_INDEX(data_ring, blk_lpos->next) +
+ DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
+}
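
Continuing the hypothetical 32-byte ring from the get_next_lpos() example (illustrative only): a wrapping block with blk_lpos->begin = 24 and blk_lpos->next = 48 counts the trailing bytes of wrap 0 as used space as well:

        /* DATA_INDEX(next) = 16, DATA_SIZE = 32, DATA_INDEX(begin) = 24 */
        unsigned int used = 16 + 32 - 24; /* 24 bytes: 8 at the end of wrap 0 + 16 at index 0 */
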
+
+/*
+ * Given @blk_lpos, return a pointer to the writer data from the data block
+ * and calculate the size of the data part. A NULL pointer is returned if
+ * @blk_lpos specifies values that could never be legal.
+ *
+ * This function (used by readers) performs strict validation on the lpos
+ * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static const char *get_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos,
+ unsigned int *data_size)
+{
+ struct prb_data_block *db;
+
+ /* Data-less data block description. */
+ if (BLK_DATALESS(blk_lpos)) {
+ /*
+ * Records that are just empty lines are also valid, even
+ * though they do not have a data block. For such records
+ * explicitly return empty string data to signify success.
+ */
+ if (blk_lpos->begin == EMPTY_LINE_LPOS &&
+ blk_lpos->next == EMPTY_LINE_LPOS) {
+ *data_size = 0;
+ return "";
+ }
+
+ /* Data lost, invalid, or otherwise unavailable. */
+ return NULL;
+ }
+
+ /* Regular data block: @begin less than @next and in same wrap. */
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
+ blk_lpos->begin < blk_lpos->next) {
+ db = to_block(data_ring, blk_lpos->begin);
+ *data_size = blk_lpos->next - blk_lpos->begin;
+
+ /* Wrapping data block: @begin is one wrap behind @next. */
+ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
+ DATA_WRAPS(data_ring, blk_lpos->next)) {
+ db = to_block(data_ring, 0);
+ *data_size = DATA_INDEX(data_ring, blk_lpos->next);
+
+ /* Illegal block description. */
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /* A valid data block will always be aligned to the ID size. */
+ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) ||
+ WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
+ return NULL;
+ }
+
+ /* A valid data block will always have at least an ID. */
+ if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
+ return NULL;
+
+ /* Subtract block ID space from size to reflect data size. */
+ *data_size -= sizeof(db->id);
+
+ return &db->data[0];
+}
+
+/*
+ * Attempt to transition the newest descriptor from committed back to reserved
+ * so that the record can be modified by a writer again. This is only possible
+ * if the descriptor is not yet finalized and the provided @caller_id matches.
+ */
+static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
+ u32 caller_id, unsigned long *id_out)
+{
+ unsigned long prev_state_val;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_desc *d;
+ unsigned long id;
+ u32 cid;
+
+ id = atomic_long_read(&desc_ring->head_id);
+
+ /*
+ * To reduce unnecessarily reopening, first check if the descriptor
+ * state and caller ID are correct.
+ */
+ d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
+ if (d_state != desc_committed || cid != caller_id)
+ return NULL;
+
+ d = to_desc(desc_ring, id);
+
+ prev_state_val = DESC_SV(id, desc_committed);
+
+ /*
+ * Guarantee the reserved state is stored before reading any
+ * record data. A full memory barrier is needed because @state_var
+ * modification is followed by reading. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reopen_last:A reads from _prb_commit:B, then
+ * prb_reserve_in_last:A reads from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * MB from desc_reopen_last:A to prb_reserve_in_last:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
+ return NULL;
+ }
+
+ *id_out = id;
+ return d;
+}
+
+/**
+ * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
+ * used by the newest record.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to re-reserve and extend data in.
+ * @r: The record structure to allocate buffers for.
+ * @caller_id: The caller ID of the caller (reserving writer).
+ * @max_size: Fail if the extended size would be greater than this.
+ *
+ * This is the public function available to writers to re-reserve and extend
+ * data.
+ *
+ * The writer specifies the text size to extend (not the new total size) by
+ * setting the @text_buf_size field of @r. To ensure proper initialization
+ * of @r, prb_rec_init_wr() should be used.
+ *
+ * This function will fail if @caller_id does not match the caller ID of the
+ * newest record. In that case the caller must reserve new data using
+ * prb_reserve().
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if text data could be extended, otherwise false.
+ *
+ * On success:
+ *
+ * - @r->text_buf points to the beginning of the entire text buffer.
+ *
+ * - @r->text_buf_size is set to the new total size of the buffer.
+ *
+ * - @r->info is not touched so that @r->info->text_len could be used
+ * to append the text.
+ *
+ * - prb_record_text_space() can be used on @e to query the new
+ * actually used space.
+ *
+ * Important: All @r->info fields will already be set with the current values
+ * for the record. I.e. @r->info->text_len will be less than
+ * @text_buf_size. Writers can use @r->info->text_len to know
+ * where concatenation begins and writers should update
+ * @r->info->text_len after concatenating.
+ */
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ unsigned int data_size;
+ struct prb_desc *d;
+ unsigned long id;
+
+ local_irq_save(e->irqflags);
+
+ /* Transition the newest descriptor back to the reserved state. */
+ d = desc_reopen_last(desc_ring, caller_id, &id);
+ if (!d) {
+ local_irq_restore(e->irqflags);
+ goto fail_reopen;
+ }
+
+ /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
+
+ info = to_info(desc_ring, id);
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * anything fails from now on.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * desc_reopen_last() checked the caller_id, but there was no
+ * exclusive access at that point. The descriptor may have
+ * changed since then.
+ */
+ if (caller_id != info->caller_id)
+ goto fail;
+
+ if (BLK_DATALESS(&d->text_blk_lpos)) {
+ if (WARN_ON_ONCE(info->text_len != 0)) {
+ pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
+ info->text_len);
+ info->text_len = 0;
+ }
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_alloc(rb, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ } else {
+ if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
+ goto fail;
+
+ /*
+ * Increase the buffer size to include the original size. If
+ * the meta data (@text_len) is not sane, use the full data
+ * block size.
+ */
+ if (WARN_ON_ONCE(info->text_len > data_size)) {
+ pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
+ info->text_len, data_size);
+ info->text_len = data_size;
+ }
+ r->text_buf_size += info->text_len;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_realloc(rb, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ }
+ if (r->text_buf_size && !r->text_buf)
+ goto fail;
+
+ r->info = info;
+
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+fail_reopen:
+ /* Make it clear to the caller that the re-reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+
+/*
+ * @last_finalized_seq value guarantees that all records up to and including
+ * this sequence number are finalized and can be read. The only exception are
+ * too old records which have already been overwritten.
+ *
+ * It is also guaranteed that @last_finalized_seq only increases.
+ *
+ * Be aware that finalized records following non-finalized records are not
+ * reported because they are not yet available to the reader. For example,
+ * a new record stored via printk() will not be available to a printer if
+ * it follows a record that has not been finalized yet. However, once that
+ * non-finalized record becomes finalized, @last_finalized_seq will be
+ * appropriately updated and the full set of finalized records will be
+ * available to the printer. And since each printk() caller will either
+ * directly print or trigger deferred printing of all available unprinted
+ * records, all printk() messages will get printed.
+ */
+static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long ulseq;
+
+ /*
+ * Guarantee the sequence number is loaded before loading the
+ * associated record in order to guarantee that the record can be
+ * seen by this CPU. This pairs with desc_update_last_finalized:A.
+ */
+ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq
+ ); /* LMM(desc_last_finalized_seq:A) */
+
+ return __ulseq_to_u64seq(rb, ulseq);
+}
+
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count);
+
+/*
+ * Check if there are records directly following @last_finalized_seq that are
+ * finalized. If so, update @last_finalized_seq to the latest of these
+ * records. It is not allowed to skip over records that are not yet finalized.
+ */
+static void desc_update_last_finalized(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ u64 old_seq = desc_last_finalized_seq(rb);
+ unsigned long oldval;
+ unsigned long newval;
+ u64 finalized_seq;
+ u64 try_seq;
+
+try_again:
+ finalized_seq = old_seq;
+ try_seq = finalized_seq + 1;
+
+ /* Try to find later finalized records. */
+ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) {
+ finalized_seq = try_seq;
+ try_seq++;
+ }
+
+ /* No update needed if no later finalized record was found. */
+ if (finalized_seq == old_seq)
+ return;
+
+ oldval = __u64seq_to_ulseq(old_seq);
+ newval = __u64seq_to_ulseq(finalized_seq);
+
+ /*
+ * Set the sequence number of a later finalized record that has been
+ * seen.
+ *
+ * Guarantee the record data is visible to other CPUs before storing
+ * its sequence number. This pairs with desc_last_finalized_seq:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then desc_read:A reads from
+ * _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * RELEASE from _prb_commit:B to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
+ *
+ * Note: _prb_commit:B and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A
+ * CPU (which performs the release) must have previously seen
+ * _prb_commit:B.
+ */
+ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
+ &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */
+ old_seq = __ulseq_to_u64seq(rb, oldval);
+ goto try_again;
+ }
+}
+
+/*
+ * Attempt to finalize a specified descriptor. If this fails, the descriptor
+ * is either already final or it will finalize itself when the writer commits.
+ */
+static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long prev_state_val = DESC_SV(id, desc_committed);
+ struct prb_desc *d = to_desc(desc_ring, id);
+
+ if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */
+ desc_update_last_finalized(rb);
+ }
+}
+
+/**
+ * prb_reserve() - Reserve space in the ringbuffer.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to reserve data in.
+ * @r: The record structure to allocate buffers for.
+ *
+ * This is the public function available to writers to reserve data.
+ *
+ * The writer specifies the text size to reserve by setting the
+ * @text_buf_size field of @r. To ensure proper initialization of @r,
+ * prb_rec_init_wr() should be used.
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if at least text data could be allocated, otherwise false.
+ *
+ * On success, the fields @info and @text_buf of @r will be set by this
+ * function and should be filled in by the writer before committing. Also
+ * on success, prb_record_text_space() can be used on @e to query the actual
+ * space used for the text data block.
+ *
+ * Important: @info->text_len needs to be set correctly by the writer in
+ * order for data to be readable and/or extended. Its value
+ * is initialized to 0.
+ */
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ struct prb_desc *d;
+ unsigned long id;
+ u64 seq;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ /*
+ * Descriptors in the reserved state act as blockers to all further
+ * reservations once the desc_ring has fully wrapped. Disable
+ * interrupts during the reserve/commit window in order to minimize
+ * the likelihood of this happening.
+ */
+ local_irq_save(e->irqflags);
+
+ if (!desc_reserve(rb, &id)) {
+ /* Descriptor reservation failures are tracked. */
+ atomic_long_inc(&rb->fail);
+ local_irq_restore(e->irqflags);
+ goto fail;
+ }
+
+ d = to_desc(desc_ring, id);
+ info = to_info(desc_ring, id);
+
+ /*
+ * All @info fields (except @seq) are cleared and must be filled in
+ * by the writer. Save @seq before clearing because it is used to
+ * determine the new sequence number.
+ */
+ seq = info->seq;
+ memset(info, 0, sizeof(*info));
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * text data allocation fails.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * Initialize the sequence number if it has "never been set".
+ * Otherwise just increment it by a full wrap.
+ *
+ * @seq is considered "never been set" if it has a value of 0,
+ * _except_ for @infos[0], which was specially setup by the ringbuffer
+ * initializer and therefore is always considered as set.
+ *
+ * See the "Bootstrap" comment block in printk_ringbuffer.h for
+ * details about how the initializer bootstraps the descriptors.
+ */
+ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
+ info->seq = DESC_INDEX(desc_ring, id);
+ else
+ info->seq = seq + DESCS_COUNT(desc_ring);
+
+ /*
+ * New data is about to be reserved. Once that happens, previous
+ * descriptors are no longer able to be extended. Finalize the
+ * previous descriptor now so that it can be made available to
+ * readers. (For seq==0 there is no previous descriptor.)
+ */
+ if (info->seq > 0)
+ desc_make_final(rb, DESC_ID(id - 1));
+
+ r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
+ /* If text data allocation fails, a data-less record is committed. */
+ if (r->text_buf_size && !r->text_buf) {
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+ goto fail;
+ }
+
+ r->info = info;
+
+ /* Record full text space used by record. */
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ /* Make it clear to the caller that the reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
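
To illustrate the sequence-number assignment above (illustrative only, using the DEFINE_PRINTKRB(test_rb, 15, 5) example ring with 2^15 descriptors): a descriptor slot's sequence number advances by the descriptor count each time the slot is reused, so DESC_INDEX() of a sequence number always identifies its slot:

        /* Hypothetical: a desc_ring with 2^15 = 32768 descriptors. */
        u64 first_seq  = 7;         /* slot 7, first reservation      */
        u64 second_seq = 7 + 32768; /* same slot, one full wrap later */

        /* DESC_INDEX(desc_ring, first_seq) == DESC_INDEX(desc_ring, second_seq) == 7 */
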
+
+/* Commit the data (possibly finalizing it) and restore interrupts. */
+static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ struct prb_desc *d = to_desc(desc_ring, e->id);
+ unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);
+
+ /* Now the writer has finished all writing: LMM(_prb_commit:A) */
+
+ /*
+ * Set the descriptor as committed. See "ABA Issues" about why
+ * cmpxchg() instead of set() is used.
+ *
+ * 1. Guarantee all record data is stored before the descriptor state
+ * is stored as committed. A write memory barrier is sufficient
+ * for this. This pairs with desc_read:B and desc_reopen_last:A.
+ *
+ * 2. Guarantee the descriptor state is stored as committed before
+ * re-checking the head ID in order to possibly finalize this
+ * descriptor. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_commit:A reads from desc_reserve:D, then
+ * desc_make_final:A reads from _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * MB _prb_commit:B to prb_commit:A
+ * matching
+ * MB desc_reserve:D to desc_make_final:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Restore interrupts, the reserve/commit window is finished. */
+ local_irq_restore(e->irqflags);
+}
+
+/**
+ * prb_commit() - Commit (previously reserved) data to the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit data.
+ *
+ * Note that the data is not yet available to readers until it is finalized.
+ * Finalizing happens automatically when space for the next record is
+ * reserved.
+ *
+ * See prb_final_commit() for a version of this function that finalizes
+ * immediately.
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_commit(struct prb_reserved_entry *e)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ unsigned long head_id;
+
+ _prb_commit(e, desc_committed);
+
+ /*
+ * If this descriptor is no longer the head (i.e. a new record has
+ * been allocated), extending the data for this record is no longer
+ * allowed and therefore it must be finalized.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
+ if (head_id != e->id)
+ desc_make_final(e->rb, e->id);
+}
+
+/**
+ * prb_final_commit() - Commit and finalize (previously reserved) data to
+ * the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit+finalize data.
+ *
+ * By finalizing, the data is made immediately available to readers.
+ *
+ * This function should only be used if there are no intentions of extending
+ * this data using prb_reserve_in_last().
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_final_commit(struct prb_reserved_entry *e)
+{
+ _prb_commit(e, desc_finalized);
+
+ desc_update_last_finalized(e->rb);
+}
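
Taken together with prb_rec_init_wr() from printk_ringbuffer.h, the reserve/commit calls above form the writer path. A minimal writer sketch, assuming a hypothetical ringbuffer test_rb defined via DEFINE_PRINTKRB() and a hypothetical message string:

	struct prb_reserved_entry e;
	struct printk_record r;
	const char *msg = "hello";	/* hypothetical payload */

	prb_rec_init_wr(&r, strlen(msg) + 1);

	if (prb_reserve(&e, &test_rb, &r)) {
		snprintf(r.text_buf, r.text_buf_size, "%s", msg);

		/* @text_len must be set by the writer (see prb_reserve()). */
		r.info->text_len = strlen(msg);
		r.info->ts_nsec = local_clock();
		r.info->caller_id = 1;	/* hypothetical caller id */

		/* Commit and make the record immediately available to readers. */
		prb_final_commit(&e);
	}

Using prb_commit() instead of prb_final_commit() would leave the record open for a possible prb_reserve_in_last() extension; it would then be finalized automatically when the next record is reserved.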
+
+/*
+ * Count the number of lines in provided text. All text has at least 1 line
+ * (even if @text_size is 0). Each '\n' processed is counted as an additional
+ * line.
+ */
+static unsigned int count_lines(const char *text, unsigned int text_size)
+{
+ unsigned int next_size = text_size;
+ unsigned int line_count = 1;
+ const char *next = text;
+
+ while (next_size) {
+ next = memchr(next, '\n', next_size);
+ if (!next)
+ break;
+ line_count++;
+ next++;
+ next_size = text_size - (next - text);
+ }
+
+ return line_count;
+}
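
For example, given the counting rule above:

	/*
	 * count_lines("", 0)           -> 1  (empty text is still one line)
	 * count_lines("foo", 3)        -> 1
	 * count_lines("foo\nbar\n", 8) -> 3  (1 + two '\n' characters)
	 */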
+
+/*
+ * Given @blk_lpos, copy an expected @len of data into the provided buffer.
+ * If @line_count is provided, count the number of lines in the data.
+ *
+ * This function (used by readers) performs strict validation on the data
+ * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static bool copy_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
+ unsigned int buf_size, unsigned int *line_count)
+{
+ unsigned int data_size;
+ const char *data;
+
+ /* Caller might not want any data. */
+ if ((!buf || !buf_size) && !line_count)
+ return true;
+
+ data = get_data(data_ring, blk_lpos, &data_size);
+ if (!data)
+ return false;
+
+ /*
+ * Actual cannot be less than expected. It can be more than expected
+ * because of the trailing alignment padding.
+ *
+ * Note that invalid @len values can occur because the caller loads
+ * the value during an allowed data race.
+ */
+ if (data_size < (unsigned int)len)
+ return false;
+
+ /* Caller interested in the line count? */
+ if (line_count)
+ *line_count = count_lines(data, len);
+
+ /* Caller interested in the data content? */
+ if (!buf || !buf_size)
+ return true;
+
+ data_size = min_t(unsigned int, buf_size, len);
+
+ memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
+ return true;
+}
+
+/*
+ * This is an extended version of desc_read(). It gets a copy of a specified
+ * descriptor. However, it also verifies that the record is finalized and has
+ * the sequence number @seq. On success, 0 is returned.
+ *
+ * Error return values:
+ * -EINVAL: A finalized record with sequence number @seq does not exist.
+ * -ENOENT: A finalized record with sequence number @seq exists, but its data
+ * is not available. This is a valid record, so readers should
+ * continue with the next record.
+ */
+static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
+ unsigned long id, u64 seq,
+ struct prb_desc *desc_out)
+{
+ struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos;
+ enum desc_state d_state;
+ u64 s;
+
+ d_state = desc_read(desc_ring, id, desc_out, &s, NULL);
+
+ /*
+ * An unexpected @id (desc_miss) or @seq mismatch means the record
+ * does not exist. A descriptor in the reserved or committed state
+ * means the record does not yet exist for the reader.
+ */
+ if (d_state == desc_miss ||
+ d_state == desc_reserved ||
+ d_state == desc_committed ||
+ s != seq) {
+ return -EINVAL;
+ }
+
+ /*
+ * A descriptor in the reusable state may no longer have its data
+ * available; report it as existing but with lost data. Or the record
+ * may actually be a record with lost data.
+ */
+ if (d_state == desc_reusable ||
+ (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy the ringbuffer data from the record with @seq to the provided
+ * @r buffer. On success, 0 is returned.
+ *
+ * See desc_read_finalized_seq() for error return values.
+ */
+static int prb_read(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info = to_info(desc_ring, seq);
+ struct prb_desc *rdesc = to_desc(desc_ring, seq);
+ atomic_long_t *state_var = &rdesc->state_var;
+ struct prb_desc desc;
+ unsigned long id;
+ int err;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ id = DESC_ID(atomic_long_read(state_var));
+
+ /* Get a local copy of the correct descriptor (if available). */
+ err = desc_read_finalized_seq(desc_ring, id, seq, &desc);
+
+ /*
+ * If @r is NULL, the caller is only interested in the availability
+ * of the record.
+ */
+ if (err || !r)
+ return err;
+
+ /* If requested, copy meta data. */
+ if (r->info)
+ memcpy(r->info, info, sizeof(*(r->info)));
+
+ /* Copy text data. If it fails, this is a data-less record. */
+ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
+ r->text_buf, r->text_buf_size, line_count)) {
+ return -ENOENT;
+ }
+
+ /* Ensure the record is still finalized and has the same @seq. */
+ return desc_read_finalized_seq(desc_ring, id, seq, &desc);
+}
+
+/* Get the sequence number of the tail descriptor. */
+u64 prb_first_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ unsigned long id;
+ u64 seq;
+
+ for (;;) {
+ id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */
+
+ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */
+
+ /*
+ * This loop will not be infinite because the tail is
+ * _always_ in the finalized or reusable state.
+ */
+ if (d_state == desc_finalized || d_state == desc_reusable)
+ break;
+
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail in the case
+ * that the descriptor has been recycled. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_first_seq:B reads from desc_reserve:F, then
+ * prb_first_seq:A reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB prb_first_seq:B to prb_first_seq:A
+ */
+ smp_rmb(); /* LMM(prb_first_seq:C) */
+ }
+
+ return seq;
+}
+
+/**
+ * prb_next_reserve_seq() - Get the sequence number after the most recently
+ * reserved record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what sequence
+ * number will be assigned to the next reserved record.
+ *
+ * Note that depending on the situation, this value can be equal to or
+ * higher than the sequence number returned by prb_next_seq().
+ *
+ * Context: Any context.
+ * Return: The sequence number that will be assigned to the next record
+ * reserved.
+ */
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long last_finalized_id;
+ atomic_long_t *state_var;
+ u64 last_finalized_seq;
+ unsigned long head_id;
+ struct prb_desc desc;
+ unsigned long diff;
+ struct prb_desc *d;
+ int err;
+
+ /*
+ * It may not be possible to read a sequence number for @head_id.
+ * So the ID of @last_finalized_seq is used to calculate what the
+ * sequence number of @head_id will be.
+ */
+
+try_again:
+ last_finalized_seq = desc_last_finalized_seq(rb);
+
+ /*
+ * @head_id is loaded after @last_finalized_seq to ensure that
+ * it points to the record with @last_finalized_seq or newer.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_last_finalized_seq:A reads from
+ * desc_update_last_finalized:A, then
+ * prb_next_reserve_seq:A reads from desc_reserve:D.
+ *
+ * Relies on:
+ *
+ * RELEASE from desc_reserve:D to desc_update_last_finalized:A
+ * matching
+ * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
+ *
+ * Note: desc_reserve:D and desc_update_last_finalized:A can be
+ * different CPUs. However, the desc_update_last_finalized:A CPU
+ * (which performs the release) must have previously seen
+ * desc_read:C, which implies desc_reserve:D can be seen.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */
+
+ d = to_desc(desc_ring, last_finalized_seq);
+ state_var = &d->state_var;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ last_finalized_id = DESC_ID(atomic_long_read(state_var));
+
+ /* Ensure @last_finalized_id is correct. */
+ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);
+
+ if (err == -EINVAL) {
+ if (last_finalized_seq == 0) {
+ /*
+ * No record has been finalized or even reserved yet.
+ *
+ * The @head_id is initialized such that the first
+ * increment will yield the first record (seq=0).
+ * Handle it separately to avoid a negative @diff
+ * below.
+ */
+ if (head_id == DESC0_ID(desc_ring->count_bits))
+ return 0;
+
+ /*
+ * One or more descriptors are already reserved. Use
+ * the descriptor ID of the first one (@seq=0) for
+ * the @diff below.
+ */
+ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
+ } else {
+ /* Record must have been overwritten. Try again. */
+ goto try_again;
+ }
+ }
+
+ /* Diff of known descriptor IDs to compute related sequence numbers. */
+ diff = head_id - last_finalized_id;
+
+ /*
+ * @head_id points to the most recently reserved record, but this
+ * function returns the sequence number that will be assigned to the
+ * next (not yet reserved) record. Thus +1 is needed.
+ */
+ return (last_finalized_seq + diff + 1);
+}
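
A small worked example of the ID arithmetic, with hypothetical values (the last finalized record has sequence number 100 and three newer records have been reserved since):

	/*
	 * last_finalized_seq = 100, its descriptor ID = X
	 * head_id            = X + 3
	 *
	 * diff = head_id - last_finalized_id = 3
	 *
	 * Records 101, 102 and 103 are reserved (not necessarily
	 * finalized yet), so the next reservation will be assigned:
	 *
	 *   last_finalized_seq + diff + 1 = 100 + 3 + 1 = 104
	 */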
+
+/*
+ * Non-blocking read of a record.
+ *
+ * On success @seq is updated to the record that was read and (if provided)
+ * @r and @line_count will contain the read/calculated data.
+ *
+ * On failure @seq is updated to a record that is not yet available to the
+ * reader, but it will be the next record available to the reader.
+ *
+ * Note: When the current CPU is in panic, this function will skip over any
+ * non-existent/non-finalized records in order to allow the panic CPU
+ * to print any and all records that have been finalized.
+ */
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ u64 tail_seq;
+ int err;
+
+ while ((err = prb_read(rb, *seq, r, line_count))) {
+ tail_seq = prb_first_seq(rb);
+
+ if (*seq < tail_seq) {
+ /*
+ * Behind the tail. Catch up and try again. This
+ * can happen for -ENOENT and -EINVAL cases.
+ */
+ *seq = tail_seq;
+
+ } else if (err == -ENOENT) {
+ /* Record exists, but the data was lost. Skip. */
+ (*seq)++;
+
+ } else {
+ /*
+ * Non-existent/non-finalized record. Must stop.
+ *
+ * For panic situations it cannot be expected that
+ * non-finalized records will become finalized. But
+ * there may be other finalized records beyond that
+ * need to be printed for a panic situation. If this
+ * is the panic CPU, skip this
+ * non-existent/non-finalized record unless it is
+ * at or beyond the head, in which case it is not
+ * possible to continue.
+ *
+ * Note that new messages printed on panic CPU are
+ * finalized when we are here. The only exception
+ * might be the last message without trailing newline.
+ * But it would have the sequence number returned
+ * by "prb_next_reserve_seq() - 1".
+ */
+ if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb)))
+ (*seq)++;
+ else
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * prb_read_valid() - Non-blocking read of a requested record or (if gone)
+ * the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @r: A record data buffer to store the read record to.
+ *
+ * This is the public function available to readers to read a record.
+ *
+ * The reader provides the @info and @text_buf buffers of @r to be
+ * filled in. Any of the buffer pointers can be set to NULL if the reader
+ * is not interested in that data. To ensure proper initialization of @r,
+ * prb_rec_init_rd() should be used.
+ *
+ * Context: Any context.
+ * Return: true if a record was read, otherwise false.
+ *
+ * On success, the reader must check r->info.seq to see which record was
+ * actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a record not yet available to the reader.
+ */
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r)
+{
+ return _prb_read_valid(rb, &seq, r, NULL);
+}
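
A minimal reader sketch, again assuming a hypothetical ringbuffer test_rb: the reader supplies its own buffers and walks forward, using r.info->seq to detect dropped records:

	struct printk_info info;
	struct printk_record r;
	char text_buf[128];
	u64 seq = 0;

	prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));

	while (prb_read_valid(&test_rb, seq, &r)) {
		if (info.seq != seq)
			pr_warn("lost %llu records\n", info.seq - seq);

		/* The text is not terminated and may have been truncated. */
		pr_info("%llu: %.*s\n", info.seq,
			(int)min_t(unsigned int, info.text_len, r.text_buf_size),
			text_buf);

		seq = info.seq + 1;
	}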
+
+/**
+ * prb_read_valid_info() - Non-blocking read of meta data for a requested
+ * record or (if gone) the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @info: A buffer to store the read record meta data to.
+ * @line_count: A buffer to store the number of lines in the record text.
+ *
+ * This is the public function available to readers to read only the
+ * meta data of a record.
+ *
+ * The reader provides the @info, @line_count buffers to be filled in.
+ * Either of the buffer pointers can be set to NULL if the reader is not
+ * interested in that data.
+ *
+ * Context: Any context.
+ * Return: true if a record's meta data was read, otherwise false.
+ *
+ * On success, the reader must check info->seq to see which record meta data
+ * was actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a record not yet available to the reader.
+ */
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count)
+{
+ struct printk_record r;
+
+ prb_rec_init_rd(&r, info, NULL, 0);
+
+ return _prb_read_valid(rb, &seq, &r, line_count);
+}
+
+/**
+ * prb_first_valid_seq() - Get the sequence number of the oldest available
+ * record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the
+ * first/oldest valid sequence number is.
+ *
+ * This provides readers a starting point to begin iterating the ringbuffer.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the first/oldest record or, if the
+ * ringbuffer is empty, 0 is returned.
+ */
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq = 0;
+
+ if (!_prb_read_valid(rb, &seq, NULL, NULL))
+ return 0;
+
+ return seq;
+}
+
+/**
+ * prb_next_seq() - Get the sequence number after the last available record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the next
+ * newest sequence number available to readers will be.
+ *
+ * This provides readers a sequence number to jump to if all currently
+ * available records should be skipped. It is guaranteed that all records
+ * previous to the returned value have been finalized and are (or were)
+ * available to the reader.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the next newest (not yet available) record
+ * for readers.
+ */
+u64 prb_next_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq;
+
+ seq = desc_last_finalized_seq(rb);
+
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because, due to hack#2
+ * of the bootstrapping phase, it is not known whether a
+ * record with sequence number 0 exists yet.
+ */
+ if (seq != 0)
+ seq++;
+
+ /*
+ * The information about the last finalized @seq might be inaccurate.
+ * Search forward to find the current one.
+ */
+ while (_prb_read_valid(rb, &seq, NULL, NULL))
+ seq++;
+
+ return seq;
+}
+
+/**
+ * prb_init() - Initialize a ringbuffer to use provided external buffers.
+ *
+ * @rb: The ringbuffer to initialize.
+ * @text_buf: The data buffer for text data.
+ * @textbits: The size of @text_buf as a power-of-2 value.
+ * @descs: The descriptor buffer for ringbuffer records.
+ * @descbits: The count of @descs items as a power-of-2 value.
+ * @infos: The printk_info buffer for ringbuffer records.
+ *
+ * This is the public function available to writers to set up a ringbuffer
+ * during runtime using provided buffers.
+ *
+ * This must match the initialization of DEFINE_PRINTKRB().
+ *
+ * Context: Any context.
+ */
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int textbits,
+ struct prb_desc *descs, unsigned int descbits,
+ struct printk_info *infos)
+{
+ memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
+ memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));
+
+ rb->desc_ring.count_bits = descbits;
+ rb->desc_ring.descs = descs;
+ rb->desc_ring.infos = infos;
+ atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);
+
+ rb->text_data_ring.size_bits = textbits;
+ rb->text_data_ring.data = text_buf;
+ atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
+ atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));
+
+ atomic_long_set(&rb->fail, 0);
+
+ atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;
+
+ infos[0].seq = -(u64)_DESCS_COUNT(descbits);
+ infos[_DESCS_COUNT(descbits) - 1].seq = 0;
+}
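
A sketch of a runtime setup with externally allocated buffers; the names and sizes below are hypothetical (printk itself performs a similar call when it switches to a dynamically allocated log buffer):

	#define NEW_DESC_BITS	10			/* 1024 descriptors */
	#define NEW_TEXT_BITS	(NEW_DESC_BITS + 5)	/* avg 32 bytes of text each */

	static struct prb_desc new_descs[_DESCS_COUNT(NEW_DESC_BITS)];
	static struct printk_info new_infos[_DESCS_COUNT(NEW_DESC_BITS)];
	static char new_text[1UL << NEW_TEXT_BITS] __aligned(__alignof__(unsigned long));
	static struct printk_ringbuffer new_rb;

	static void __init new_rb_setup(void)
	{
		prb_init(&new_rb, new_text, NEW_TEXT_BITS,
			 new_descs, NEW_DESC_BITS, new_infos);
	}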
+
+/**
+ * prb_record_text_space() - Query the full actual used ringbuffer space for
+ * the text data of a reserved entry.
+ *
+ * @e: The successfully reserved entry to query.
+ *
+ * This is the public function available to writers to see how much actual
+ * space is used in the ringbuffer to store the text data of the specified
+ * entry.
+ *
+ * This function is only valid if @e has been successfully reserved using
+ * prb_reserve().
+ *
+ * Context: Any context.
+ * Return: The size in bytes used by the text data of the associated record.
+ */
+unsigned int prb_record_text_space(struct prb_reserved_entry *e)
+{
+ return e->text_space;
+}
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
new file mode 100644
index 000000000000..4ef81349d9fb
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.h
@@ -0,0 +1,437 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_PRINTK_RINGBUFFER_H
+#define _KERNEL_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+#include <linux/bits.h>
+#include <linux/dev_printk.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+/*
+ * Meta information about each stored message.
+ *
+ * All fields are set by the printk code except for @seq, which is
+ * set by the ringbuffer code.
+ */
+struct printk_info {
+ u64 seq; /* sequence number */
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 text_len; /* length of text message */
+ u8 facility; /* syslog facility */
+ u8 flags:5; /* internal record flags */
+ u8 level:3; /* syslog level */
+ u32 caller_id; /* thread id or processor id */
+
+ struct dev_printk_info dev_info;
+};
+
+/*
+ * A structure providing the buffers, used by writers and readers.
+ *
+ * Writers:
+ * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
+ * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
+ * buffers reserved for that writer.
+ *
+ * Readers:
+ * Using prb_rec_init_rd(), a reader sets all fields before calling
+ * prb_read_valid(). Note that the reader provides the @info and @text_buf
+ * buffers. On success, the struct pointed to by @info will be filled and
+ * the char array pointed to by @text_buf will be filled with text data.
+ */
+struct printk_record {
+ struct printk_info *info;
+ char *text_buf;
+ unsigned int text_buf_size;
+};
+
+/* Specifies the logical position and span of a data block. */
+struct prb_data_blk_lpos {
+ unsigned long begin;
+ unsigned long next;
+};
+
+/*
+ * A descriptor: the complete meta-data for a record.
+ *
+ * @state_var: A bitwise combination of descriptor ID and descriptor state.
+ */
+struct prb_desc {
+ atomic_long_t state_var;
+ struct prb_data_blk_lpos text_blk_lpos;
+};
+
+/* A ringbuffer of "ID + data" elements. */
+struct prb_data_ring {
+ unsigned int size_bits;
+ char *data;
+ atomic_long_t head_lpos;
+ atomic_long_t tail_lpos;
+};
+
+/* A ringbuffer of "struct prb_desc" elements. */
+struct prb_desc_ring {
+ unsigned int count_bits;
+ struct prb_desc *descs;
+ struct printk_info *infos;
+ atomic_long_t head_id;
+ atomic_long_t tail_id;
+ atomic_long_t last_finalized_seq;
+};
+
+/*
+ * The high level structure representing the printk ringbuffer.
+ *
+ * @fail: Count of failed prb_reserve() calls where not even a data-less
+ * record was created.
+ */
+struct printk_ringbuffer {
+ struct prb_desc_ring desc_ring;
+ struct prb_data_ring text_data_ring;
+ atomic_long_t fail;
+};
+
+/*
+ * Used by writers as a reserve/commit handle.
+ *
+ * @rb: Ringbuffer where the entry is reserved.
+ * @irqflags: Saved irq flags to restore on entry commit.
+ * @id: ID of the reserved descriptor.
+ * @text_space: Total occupied buffer space in the text data ring, including
+ * ID, alignment padding, and wrapping data blocks.
+ *
+ * This structure is an opaque handle for writers. Its contents are only
+ * to be used by the ringbuffer implementation.
+ */
+struct prb_reserved_entry {
+ struct printk_ringbuffer *rb;
+ unsigned long irqflags;
+ unsigned long id;
+ unsigned int text_space;
+};
+
+/* The possible responses of a descriptor state-query. */
+enum desc_state {
+ desc_miss = -1, /* ID mismatch (pseudo state) */
+ desc_reserved = 0x0, /* reserved, in use by writer */
+ desc_committed = 0x1, /* committed by writer, could get reopened */
+ desc_finalized = 0x2, /* committed, no further modification allowed */
+ desc_reusable = 0x3, /* free, not yet used by any writer */
+};
+
+#define _DATA_SIZE(sz_bits) (1UL << (sz_bits))
+#define _DESCS_COUNT(ct_bits) (1U << (ct_bits))
+#define DESC_SV_BITS BITS_PER_LONG
+#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2)
+#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT)
+#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT))
+#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
+#define DESC_ID_MASK (~DESC_FLAGS_MASK)
+#define DESC_ID(sv) ((sv) & DESC_ID_MASK)
+
+/*
+ * Special data block logical position values (for fields of
+ * @prb_desc.text_blk_lpos).
+ *
+ * - Bit0 is used to identify if the record has no data block. (Implemented in
+ * the LPOS_DATALESS() macro.)
+ *
+ * - Bit1 specifies the reason for not having a data block.
+ *
+ * These special values could never be real lpos values because of the
+ * meta data and alignment padding of data blocks. (See to_blk_size() for
+ * details.)
+ */
+#define FAILED_LPOS 0x1
+#define EMPTY_LINE_LPOS 0x3
+
+#define FAILED_BLK_LPOS \
+{ \
+ .begin = FAILED_LPOS, \
+ .next = FAILED_LPOS, \
+}
+
+/*
+ * Descriptor Bootstrap
+ *
+ * The descriptor array is minimally initialized to allow immediate usage
+ * by readers and writers. The requirements that the descriptor array
+ * initialization must satisfy:
+ *
+ * Req1
+ * The tail must point to an existing (committed or reusable) descriptor.
+ * This is required by the implementation of prb_first_seq().
+ *
+ * Req2
+ * Readers must see that the ringbuffer is initially empty.
+ *
+ * Req3
+ * The first record reserved by a writer is assigned sequence number 0.
+ *
+ * To satisfy Req1, the tail initially points to a descriptor that is
+ * minimally initialized (having no data block, i.e. data-less with the
+ * data block's lpos @begin and @next values set to FAILED_LPOS).
+ *
+ * To satisfy Req2, the initial tail descriptor is initialized to the
+ * reusable state. Readers recognize reusable descriptors as existing
+ * records, but skip over them.
+ *
+ * To satisfy Req3, the last descriptor in the array is used as the initial
+ * head (and tail) descriptor. This allows the first record reserved by a
+ * writer (head + 1) to be the first descriptor in the array. (Only the first
+ * descriptor in the array could have a valid sequence number of 0.)
+ *
+ * The first time a descriptor is reserved, it is assigned a sequence number
+ * with the value of the array index. A "first time reserved" descriptor can
+ * be recognized because it has a sequence number of 0 but does not have an
+ * index of 0. (Only the first descriptor in the array could have a valid
+ * sequence number of 0.) After the first reservation, all future reservations
+ * (recycling) simply involve incrementing the sequence number by the array
+ * count.
+ *
+ * Hack #1
+ * Only the first descriptor in the array is allowed to have the sequence
+ * number 0. In this case it is not possible to recognize if it is being
+ * reserved the first time (set to index value) or has been reserved
+ * previously (increment by the array count). This is handled by _always_
+ * incrementing the sequence number by the array count when reserving the
+ * first descriptor in the array. In order to satisfy Req3, the sequence
+ * number of the first descriptor in the array is initialized to minus
+ * the array count. Then, upon the first reservation, it is incremented
+ * to 0, thus satisfying Req3.
+ *
+ * Hack #2
+ * prb_first_seq() can be called at any time by readers to retrieve the
+ * sequence number of the tail descriptor. However, due to Req2 and Req3,
+ * initially there are no records to report the sequence number of
+ * (sequence numbers are u64 and there is nothing less than 0). To handle
+ * this, the sequence number of the initial tail descriptor is initialized
+ * to 0. Technically this is incorrect, because there is no record with
+ * sequence number 0 (yet) and the tail descriptor is not the first
+ * descriptor in the array. But it allows prb_read_valid() to correctly
+ * report the existence of a record for _any_ given sequence number at all
+ * times. Bootstrapping is complete when the tail is pushed the first
+ * time, thus finally pointing to the first descriptor reserved by a
+ * writer, which has the assigned sequence number 0.
+ */
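
A small worked example of these bootstrap values, using a hypothetical array of 4 descriptors (descbits = 2):

	/*
	 * infos[3].seq = 0    initial head/tail descriptor; reports seq 0
	 *                     during the bootstrap phase (Hack #2)
	 * infos[0].seq = -4   the first reservation adds the array count:
	 *                     -4 + 4 = 0 (Hack #1)
	 *
	 * The first record reserved by a writer therefore lands in the
	 * first descriptor of the array with sequence number 0 (Req3).
	 */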
+
+/*
+ * Initiating Logical Value Overflows
+ *
+ * Both logical position (lpos) and ID values can be mapped to array indexes
+ * but may experience overflows during the lifetime of the system. To ensure
+ * that printk_ringbuffer can handle the overflows for these types, initial
+ * values are chosen that map to the correct initial array indexes, but will
+ * result in overflows soon.
+ *
+ * BLK0_LPOS
+ * The initial @head_lpos and @tail_lpos for data rings. It is at index
+ * 0 and the lpos value is such that it will overflow on the first wrap.
+ *
+ * DESC0_ID
+ * The initial @head_id and @tail_id for the desc ring. It is at the last
+ * index of the descriptor array (see Req3 above) and the ID value is such
+ * that it will overflow on the second wrap.
+ */
+#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits)))
+#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
+#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable)
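
For instance, with a hypothetical 4-descriptor ring (ct_bits = 2) and a 4 KiB text buffer (sz_bits = 12):

	/*
	 * BLK0_LPOS(12) = -4096: maps to data index 0; the unsigned lpos
	 *                        wraps through 0 after the first full pass
	 *                        over the data ring.
	 *
	 * DESC0_ID(2) = DESC_ID(-5) = DESC_ID_MASK - 4: maps to descriptor
	 *                        index 3 (the last one); the ID value wraps
	 *                        past DESC_ID_MASK when the descriptor array
	 *                        is traversed for the second time.
	 */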
+
+/*
+ * Define a ringbuffer with an external text data buffer. The same as
+ * DEFINE_PRINTKRB() but requires specifying an external buffer for the
+ * text data.
+ *
+ * Note: The specified external buffer must be of the size:
+ * 2 ^ (descbits + avgtextbits)
+ */
+#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \
+static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reusable */ \
+ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \
+ /* no associated data block */ \
+ .text_blk_lpos = FAILED_BLK_LPOS, \
+ }, \
+}; \
+static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \
+ /* this will be the first record reserved by a writer */ \
+ [0] = { \
+ /* will be incremented to 0 on the first reservation */ \
+ .seq = -(u64)_DESCS_COUNT(descbits), \
+ }, \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reports the first seq value during the bootstrap phase */ \
+ .seq = 0, \
+ }, \
+}; \
+static struct printk_ringbuffer name = { \
+ .desc_ring = { \
+ .count_bits = descbits, \
+ .descs = &_##name##_descs[0], \
+ .infos = &_##name##_infos[0], \
+ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_seq = ATOMIC_INIT(0), \
+ }, \
+ .text_data_ring = { \
+ .size_bits = (avgtextbits) + (descbits), \
+ .data = text_buf, \
+ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ }, \
+ .fail = ATOMIC_LONG_INIT(0), \
+}
+
+/**
+ * DEFINE_PRINTKRB() - Define a ringbuffer.
+ *
+ * @name: The name of the ringbuffer variable.
+ * @descbits: The number of descriptors as a power-of-2 value.
+ * @avgtextbits: The average text data size per record as a power-of-2 value.
+ *
+ * This is a macro for defining a ringbuffer and all internal structures
+ * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
+ * variant where the text data buffer can be specified externally.
+ */
+#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \
+static char _##name##_text[1U << ((avgtextbits) + (descbits))] \
+ __aligned(__alignof__(unsigned long)); \
+_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
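
For example, a hypothetical definition such as:

	DEFINE_PRINTKRB(test_rb, 5, 6);

creates a ringbuffer named test_rb with 2^5 = 32 descriptors and a 2^(5+6) = 2 KiB text data buffer, i.e. an average of 2^6 = 64 bytes of text per record.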
+
+/* Writer Interface */
+
+/**
+ * prb_rec_init_wr() - Initialize a buffer for writing records.
+ *
+ * @r: The record to initialize.
+ * @text_buf_size: The needed text buffer size.
+ */
+static inline void prb_rec_init_wr(struct printk_record *r,
+ unsigned int text_buf_size)
+{
+ r->info = NULL;
+ r->text_buf = NULL;
+ r->text_buf_size = text_buf_size;
+}
+
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r);
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size);
+void prb_commit(struct prb_reserved_entry *e);
+void prb_final_commit(struct prb_reserved_entry *e);
+
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int text_buf_size,
+ struct prb_desc *descs, unsigned int descs_count_bits,
+ struct printk_info *infos);
+unsigned int prb_record_text_space(struct prb_reserved_entry *e);
+
+/* Reader Interface */
+
+/**
+ * prb_rec_init_rd() - Initialize a buffer for reading records.
+ *
+ * @r: The record to initialize.
+ * @info: A buffer to store record meta-data.
+ * @text_buf: A buffer to store text data.
+ * @text_buf_size: The size of @text_buf.
+ *
+ * Initialize all the fields that a reader is interested in. All arguments
+ * (except @r) are optional. Only record data for arguments that are
+ * non-NULL or non-zero will be read.
+ */
+static inline void prb_rec_init_rd(struct printk_record *r,
+ struct printk_info *info,
+ char *text_buf, unsigned int text_buf_size)
+{
+ r->info = info;
+ r->text_buf = text_buf;
+ r->text_buf_size = text_buf_size;
+}
+
+/**
+ * prb_for_each_record() - Iterate over the records of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @r: A printk_record to store the record on each iteration.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_record(from, rb, s, r) \
+for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
+
+/**
+ * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @i: A printk_info to store the record meta data on each iteration.
+ * @lc: An unsigned int to store the text line count of each record.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @i->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_info(from, rb, s, i, lc) \
+for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
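
A short iteration sketch using the info-only variant (assuming the hypothetical test_rb from the examples above); prb_for_each_record() is used the same way, but with a printk_record and text buffer:

	struct printk_info info;
	unsigned int line_count;
	u64 seq;

	prb_for_each_info(0, &test_rb, seq, &info, &line_count) {
		pr_info("record %llu: %u line(s), %u byte(s) of text\n",
			info.seq, line_count, info.text_len);
	}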
+
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r);
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count);
+
+u64 prb_first_seq(struct printk_ringbuffer *rb);
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
+u64 prb_next_seq(struct printk_ringbuffer *rb);
+u64 prb_next_reserve_seq(struct printk_ringbuffer *rb);
+
+#ifdef CONFIG_64BIT
+
+#define __u64seq_to_ulseq(u64seq) (u64seq)
+#define __ulseq_to_u64seq(rb, ulseq) (ulseq)
+#define ULSEQ_MAX(rb) (-1)
+
+#else /* CONFIG_64BIT */
+
+#define __u64seq_to_ulseq(u64seq) ((u32)u64seq)
+#define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL)
+
+static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq)
+{
+ u64 rb_first_seq = prb_first_seq(rb);
+ u64 seq;
+
+ /*
+ * The provided sequence is only the lower 32 bits of the ringbuffer
+ * sequence. It needs to be expanded to 64 bits. Get the first sequence
+ * number from the ringbuffer and fold it.
+ *
+ * Having a 32-bit representation in the console is sufficient.
+ * If a console ever gets more than 2^31 records behind
+ * the ringbuffer then this is the least of the problems.
+ *
+ * Also, the access to the ringbuffer here is always safe.
+ */
+ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq);
+
+ return seq;
+}
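
A worked example of the folding, with hypothetical values where the ringbuffer's first sequence number has already crossed 2^32:

	/*
	 * rb_first_seq = 0x100000005 (2^32 + 5)
	 *
	 * Console 21 records behind the first record:
	 *   ulseq = 0xfffffff0
	 *   (u32)rb_first_seq - ulseq = 5 - 0xfffffff0 = 21  (as s32)
	 *   seq = 0x100000005 - 21 = 0xfffffff0
	 *
	 * Console 5 records ahead of the first record:
	 *   ulseq = 0x0000000a
	 *   (u32)rb_first_seq - ulseq = 5 - 10 = -5  (as s32)
	 *   seq = 0x100000005 - (-5) = 0x10000000a
	 */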
+
+#endif /* CONFIG_64BIT */
+
+#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 50aeae770434..32a28f563b13 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -4,411 +4,81 @@
*/
#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
-#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/kprobes.h>
#include "internal.h"
-/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examinig current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
-static DEFINE_PER_CPU(int, printk_context);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
+/* Context where printk messages are never suppressed */
+static atomic_t force_con;
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
+void printk_force_console_enter(void)
{
- if (printk_percpu_data_ready())
- irq_work_queue(&s->work);
+ atomic_inc(&force_con);
}
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
+void printk_force_console_exit(void)
{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
+ atomic_dec(&force_con);
}
-static inline void printk_safe_flush_line(const char *text, int len)
+bool is_printk_force_console(void)
{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
+ return atomic_read(&force_con);
}
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
+static DEFINE_PER_CPU(int, printk_context);
- return len;
+/* Can be preempted by NMI. */
+void __printk_safe_enter(void)
+{
+ this_cpu_inc(printk_context);
}
-static void report_message_lost(struct printk_safe_seq_buf *s)
+/* Can be preempted by NMI. */
+void __printk_safe_exit(void)
{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
+ this_cpu_dec(printk_context);
}
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
+void __printk_deferred_enter(void)
{
- static raw_spinlock_t read_lock =
- __RAW_SPIN_LOCK_INITIALIZER(read_lock);
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&read_lock, flags);
+ cant_migrate();
+ __printk_safe_enter();
}
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
+void __printk_deferred_exit(void)
{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
+ cant_migrate();
+ __printk_safe_exit();
}
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
+bool is_printk_legacy_deferred(void)
{
/*
- * Make sure that we could access the main ring buffer.
- * Do not risk a double release when more CPUs are up.
+ * The per-CPU variable @printk_context can be read safely in any
+ * context. CPU migration is always disabled when set.
+ *
+ * A context holding the printk_cpu_sync must not spin waiting for
+ * another CPU. For legacy printing, it could be the console_lock
+ * or the port lock.
*/
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void noinstr printk_nmi_enter(void)
-{
- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-void noinstr printk_nmi_exit(void)
-{
- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
+ return (force_legacy_kthread() ||
+ this_cpu_read(printk_context) ||
+ in_nmi() ||
+ is_printk_cpu_sync_owner());
}
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_enter(void)
-{
- this_cpu_inc(printk_context);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_exit(void)
-{
- this_cpu_dec(printk_context);
-}
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
/* Allow to pass printk() to kdb but avoid a recursion. */
if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif
-
- /*
- * Try to use the main logbuf even in NMI. But avoid calling console
- * drivers that might have their own locks.
- */
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
- raw_spin_unlock(&logbuf_lock);
- defer_console_output();
- return len;
- }
-
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
- /* No obstacles. */
return vprintk_default(fmt, args);
}
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
+EXPORT_SYMBOL(vprintk);
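
A minimal usage sketch for the new force_con counter (the surrounding code is hypothetical): messages printed between enter/exit are treated as never suppressed, per the force_con comment above.

	printk_force_console_enter();
	pr_emerg("critical condition, details follow:\n");
	pr_info("diagnostic detail that must not be filtered\n");
	printk_force_console_exit();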
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
new file mode 100644
index 000000000000..da77f3f5c1fe
--- /dev/null
+++ b/kernel/printk/sysctl.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysctl.c: General linux system control interface
+ */
+
+#include <linux/sysctl.h>
+#include <linux/printk.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include "internal.h"
+
+static const int ten_thousand = 10000;
+
+static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+
+static const struct ctl_table printk_sysctls[] = {
+ {
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_ratelimit",
+ .data = &printk_ratelimit_state.interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "printk_ratelimit_burst",
+ .data = &printk_ratelimit_state.burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "printk_delay",
+ .data = &printk_delay_msec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&ten_thousand,
+ },
+ {
+ .procname = "printk_devkmsg",
+ .data = devkmsg_log_str,
+ .maxlen = DEVKMSG_STR_MAX_SIZE,
+ .mode = 0644,
+ .proc_handler = devkmsg_sysctl_set_loglvl,
+ },
+ {
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_sysadmin,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+};
+
+void __init printk_sysctl_init(void)
+{
+ register_sysctl_init("kernel", printk_sysctls);
+}
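
The "printk" entry above uses maxlen = 4*sizeof(int) because console_loglevel is the first element of the kernel's console_printk[] array, so the sysctl exposes four integers: the current console loglevel, the default message loglevel, the minimum console loglevel, and the default (boot-time) console loglevel. A small userspace sketch for reading them (the file name is as documented; everything else is hypothetical):

	#include <stdio.h>

	int main(void)
	{
		int cur, def_msg, min_con, def_con;
		FILE *f = fopen("/proc/sys/kernel/printk", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%d %d %d %d", &cur, &def_msg, &min_con, &def_con) == 4)
			printf("console=%d default=%d minimum=%d boot-default=%d\n",
			       cur, def_msg, min_con, def_con);
		fclose(f);
		return 0;
	}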