diff options
Diffstat (limited to 'kernel/trace/trace.c')
| -rw-r--r-- | kernel/trace/trace.c | 9607 |
1 files changed, 7447 insertions, 2160 deletions
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cd500bffd9b..e575956ef9b5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * @@ -12,43 +13,53 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> -#include <generated/utsrelease.h> +#include <linux/utsname.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> +#include <linux/security.h> #include <linux/seq_file.h> -#include <linux/notifier.h> #include <linux/irqflags.h> +#include <linux/syscalls.h> #include <linux/debugfs.h> +#include <linux/tracefs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> -#include <linux/kprobes.h> +#include <linux/cleanup.h> +#include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> +#include <linux/mount.h> #include <linux/rwsem.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> +#include <linux/panic_notifier.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> +#include <linux/trace.h> +#include <linux/sched/clock.h> #include <linux/sched/rt.h> +#include <linux/fsnotify.h> +#include <linux/irq_work.h> +#include <linux/workqueue.h> +#include <linux/sort.h> +#include <linux/io.h> /* vmap_page_range() */ +#include <linux/fs_context.h> + +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ #include "trace.h" #include "trace_output.h" -/* - * On boot up, the ring buffer is set to the minimum size, so that - * we do not waste memory on systems that are not using tracing. - */ -bool ring_buffer_expanded; - +#ifdef CONFIG_FTRACE_STARTUP_TEST /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -59,31 +70,43 @@ bool ring_buffer_expanded; static bool __read_mostly tracing_selftest_running; /* - * If a tracer is running, we do not want to run SELFTEST. + * If boot-time tracing including tracers/events via kernel cmdline + * is running, we do not want to run SELFTEST. */ bool __read_mostly tracing_selftest_disabled; -/* For tracers that don't implement custom flags */ -static struct tracer_opt dummy_tracer_opt[] = { - { } -}; - -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +void __init disable_tracing_selftest(const char *reason) { - return 0; + if (!tracing_selftest_disabled) { + tracing_selftest_disabled = true; + pr_info("Ftrace startup test is disabled due to %s\n", reason); + } } +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 +#endif + +/* Pipe tracepoints to printk */ +static struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; +static bool tracepoint_printk_stop_on_boot __initdata; +static bool traceoff_after_boot __initdata; +static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); + +/* Store tracers and their flags per instance */ +struct tracers { + struct list_head list; + struct tracer *tracer; + struct tracer_flags *flags; +}; /* * To prevent the comm cache from being overwritten when no * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_cmdline_save); +DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -93,17 +116,16 @@ static DEFINE_PER_CPU(bool, trace_cmdline_save); */ static int tracing_disabled = 1; -DEFINE_PER_CPU(int, ftrace_cpu_disabled); - cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * * If there is an oops (or kernel panic) and the ftrace_dump_on_oops * is set, then ftrace_dump is called. This will output the contents * of the ftrace buffers to the console. This is very useful for - * capturing traces that lead to crashes and outputing it to a + * capturing traces that lead to crashes and outputting it to a * serial console. * * It is default off, but you can enable it with either specifying @@ -111,115 +133,478 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops + * Set instance name if you want to dump the specific trace instance + * Multiple instance dump is also supported, and instances are separated + * by commas. */ - -enum ftrace_dump_mode ftrace_dump_on_oops; +/* Set to string format zero to disable by default */ +char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); + +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +/* Map of enums to their values, for "eval_map" file */ +struct trace_eval_map_head { + struct module *mod; + unsigned long length; +}; -static int tracing_set_tracer(const char *buf); +union trace_eval_map_item; + +struct trace_eval_map_tail { + /* + * "end" is first and points to NULL as it must be different + * than "mod" or "eval_string" + */ + union trace_eval_map_item *next; + const char *end; /* points to NULL */ +}; + +static DEFINE_MUTEX(trace_eval_mutex); + +/* + * The trace_eval_maps are saved in an array with two extra elements, + * one at the beginning, and one at the end. The beginning item contains + * the count of the saved maps (head.length), and the module they + * belong to if not built in (head.mod). The ending item contains a + * pointer to the next array of saved eval_map items. + */ +union trace_eval_map_item { + struct trace_eval_map map; + struct trace_eval_map_head head; + struct trace_eval_map_tail tail; +}; + +static union trace_eval_map_item *trace_eval_maps; +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ + +int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx); -#define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; static bool allocate_snapshot; +static bool snapshot_at_boot; + +static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; +static int boot_instance_index; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; static int __init set_cmdline_ftrace(char *str) { - strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(NULL); return 1; } __setup("ftrace=", set_cmdline_ftrace); +int ftrace_dump_on_oops_enabled(void) +{ + if (!strcmp("0", ftrace_dump_on_oops)) + return 0; + else + return 1; +} + static int __init set_ftrace_dump_on_oops(char *str) { - if (*str++ != '=' || !*str) { - ftrace_dump_on_oops = DUMP_ALL; + if (!*str) { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); return 1; } - if (!strcmp("orig_cpu", str)) { - ftrace_dump_on_oops = DUMP_ORIG; - return 1; - } + if (*str == ',') { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); + strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1); + return 1; + } - return 0; + if (*str++ == '=') { + strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE); + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { - __disable_trace_on_warning = 1; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + __disable_trace_on_warning = 1; return 1; } -__setup("traceoff_on_warning=", stop_trace_on_warning); +__setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - ring_buffer_expanded = true; + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + trace_set_ring_buffer_expanded(NULL); + } return 1; } __setup("alloc_snapshot", boot_alloc_snapshot); +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); + + +static int __init boot_instance(char *str) +{ + char *slot = boot_instance_info + boot_instance_index; + int left = sizeof(boot_instance_info) - boot_instance_index; + int ret; + + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_instance_index += ret; + + return 1; +} +__setup("trace_instance=", boot_instance); + + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; -static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { - strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - trace_boot_options = trace_boot_options_buf; - return 0; + strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + return 1; } __setup("trace_options=", set_trace_boot_options); +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 1; +} +__setup("trace_clock=", set_trace_boot_clock); + +static int __init set_tracepoint_printk(char *str) +{ + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", set_tracepoint_printk); + +static int __init set_tracepoint_printk_stop(char *str) +{ + tracepoint_printk_stop_on_boot = true; + return 1; +} +__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); + +static int __init set_traceoff_after_boot(char *str) +{ + traceoff_after_boot = true; + return 1; +} +__setup("traceoff_after_boot", set_traceoff_after_boot); -unsigned long long ns2usecs(cycle_t nsec) +unsigned long long ns2usecs(u64 nsec) { nsec += 500; do_div(nsec, 1000); return nsec; } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event, int flag) +{ + struct trace_entry *entry; + unsigned int size = 0; + + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); + +static inline void ftrace_exports_enable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_inc(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_inc(&trace_marker_exports_enabled); +} + +static inline void ftrace_exports_disable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_dec(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_dec(&trace_marker_exports_enabled); +} + +static void ftrace_exports(struct ring_buffer_event *event, int flag) +{ + struct trace_export *export; + + guard(preempt_notrace)(); + + export = rcu_dereference_raw_check(ftrace_exports_list); + while (export) { + trace_process_export(export, event, flag); + export = rcu_dereference_raw_check(export->next); + } +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. + */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + ftrace_exports_enable(export); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + ftrace_exports_disable(export); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + guard(mutex)(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + guard(mutex)(&ftrace_export_lock); + return rm_ftrace_export(&ftrace_exports_list, export); +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + +/* trace_flags holds trace_options default values */ +#define TRACE_DEFAULT_FLAGS \ + (FUNCTION_DEFAULT_FLAGS | FPROFILE_DEFAULT_FLAGS | \ + TRACE_ITER(PRINT_PARENT) | TRACE_ITER(PRINTK) | \ + TRACE_ITER(ANNOTATE) | TRACE_ITER(CONTEXT_INFO) | \ + TRACE_ITER(RECORD_CMD) | TRACE_ITER(OVERWRITE) | \ + TRACE_ITER(IRQ_INFO) | TRACE_ITER(MARKERS) | \ + TRACE_ITER(HASH_PTR) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) + +/* trace_options that are only supported by global_trace */ +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER(PRINTK) | \ + TRACE_ITER(PRINTK_MSGONLY) | TRACE_ITER(RECORD_CMD) | \ + TRACE_ITER(PROF_TEXT_OFFSET) | FPROFILE_DEFAULT_FLAGS) + +/* trace_flags that are default zero for instances */ +#define ZEROED_TRACE_FLAGS \ + (TRACE_ITER(EVENT_FORK) | TRACE_ITER(FUNC_FORK) | TRACE_ITER(TRACE_PRINTK) | \ + TRACE_ITER(COPY_MARKER)) + /* - * The global_trace is the descriptor that holds the tracing - * buffers for the live tracing. For each CPU, it contains - * a link list of pages that will store trace entries. The - * page descriptor of the pages in the memory is used to hold - * the link list by linking the lru item in the page descriptor - * to each of the pages in the buffer per CPU. - * - * For each active CPU there is a data field that holds the - * pages for the buffer for that CPU. Each CPU has the same number - * of pages allocated for its buffer. + * The global_trace is the descriptor that holds the top-level tracing + * buffers for the live tracing. */ -static struct trace_array global_trace; +static struct trace_array global_trace = { + .trace_flags = TRACE_DEFAULT_FLAGS, +}; + +static struct trace_array *printk_trace = &global_trace; + +/* List of trace_arrays interested in the top level trace_marker */ +static LIST_HEAD(marker_copies); + +static __always_inline bool printk_binsafe(struct trace_array *tr) +{ + /* + * The binary format of traceprintk can cause a crash if used + * by a buffer from another boot. Force the use of the + * non binary version of trace_printk if the trace_printk + * buffer is a boot mapped ring buffer. + */ + return !(tr->flags & TRACE_ARRAY_FL_BOOT); +} + +static void update_printk_trace(struct trace_array *tr) +{ + if (printk_trace == tr) + return; + + printk_trace->trace_flags &= ~TRACE_ITER(TRACE_PRINTK); + printk_trace = tr; + tr->trace_flags |= TRACE_ITER(TRACE_PRINTK); +} + +/* Returns true if the status of tr changed */ +static bool update_marker_trace(struct trace_array *tr, int enabled) +{ + lockdep_assert_held(&event_mutex); + + if (enabled) { + if (!list_empty(&tr->marker_list)) + return false; + + list_add_rcu(&tr->marker_list, &marker_copies); + tr->trace_flags |= TRACE_ITER(COPY_MARKER); + return true; + } + + if (list_empty(&tr->marker_list)) + return false; + + list_del_init(&tr->marker_list); + tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); + return true; +} + +void trace_set_ring_buffer_expanded(struct trace_array *tr) +{ + if (!tr) + tr = &global_trace; + tr->ring_buffer_expanded = true; +} LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; - int ret = -ENODEV; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; - ret = 0; - break; + return 0; } } - mutex_unlock(&trace_types_lock); - return ret; + return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) @@ -228,37 +613,305 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the reference counter for this trace array. + * @this_tr : pointer to the trace array + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. + * + */ void trace_array_put(struct trace_array *this_tr) { - mutex_lock(&trace_types_lock); + if (!this_tr) + return; + + guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); - mutex_unlock(&trace_types_lock); +} +EXPORT_SYMBOL_GPL(trace_array_put); + +int tracing_check_open_get_tr(struct trace_array *tr) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (tracing_disabled) + return -ENODEV; + + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + + return 0; +} + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + return trace_pid_list_is_set(filtered_pids, search_pid); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @filtered_no_pids: The list of pids not to be traced + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task) +{ + /* + * If filtered_no_pids is not empty, and the task's pid is listed + * in filtered_no_pids, then return true. + * Otherwise, if filtered_pids is empty, that means we can + * trace all tasks. If it has content, then only trace pids + * within filtered_pids. + */ + + return (filtered_pids && + !trace_find_filtered_pid(filtered_pids, task->pid)) || + (filtered_no_pids && + trace_find_filtered_pid(filtered_no_pids, task->pid)); +} + +/** + * trace_filter_add_remove_task - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* "self" is set for forks, and NULL for exits */ + if (self) + trace_pid_list_set(pid_list, task->pid); + else + trace_pid_list_clear(pid_list, task->pid); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + long pid = (unsigned long)v; + unsigned int next; + + (*pos)++; + + /* pid already is +1 of the actual previous bit */ + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; + + pid = next; + + /* Return pid + 1 to allow zero to be represented */ + return (void *)(pid + 1); +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + unsigned int first; + loff_t l = 0; + + if (trace_pid_list_first(pid_list, &first) < 0) + return NULL; + + pid = first; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; } -int filter_current_check_discard(struct ring_buffer *buffer, - struct ftrace_event_call *call, void *rec, - struct ring_buffer_event *event) +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) { - return filter_check_discard(call, rec, buffer, event); + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = trace_pid_list_alloc(); + if (!pid_list) { + trace_parser_put(&parser); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); + nr_pids++; + } + } + + ret = 0; + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + if (!trace_parser_loaded(&parser)) + break; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + + pid = (pid_t)val; + + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + out: + trace_parser_put(&parser); + + if (ret < 0) { + trace_pid_list_free(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_pid_list_free(pid_list); + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; } -EXPORT_SYMBOL_GPL(filter_current_check_discard); -cycle_t ftrace_now(int cpu) +static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.trace_buffer.buffer) + if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(buf->buffer); + ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; } +u64 ftrace_now(int cpu) +{ + return buffer_ftrace_now(&global_trace.array_buffer, cpu); +} + /** - * tracing_is_enabled - Show if global_trace has been disabled + * tracing_is_enabled - Show if global_trace has been enabled * * Shows if the global trace has been enabled or not. It uses the * mirror flag "buffer_disabled" to be used in fast paths such as for @@ -273,7 +926,6 @@ int tracing_is_enabled(void) * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ - smp_rmb(); return !global_trace.buffer_disabled; } @@ -309,7 +961,7 @@ DEFINE_MUTEX(trace_types_lock); * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page - * (not reader page) in ring buffer, and this page will be rewrited + * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. @@ -381,16 +1033,60 @@ static inline void trace_access_lock_init(void) #endif -/* trace_flags holds trace_options default values */ -unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs); +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs); -static void tracer_tracing_on(struct trace_array *tr) +#else +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ +} +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned long trace_ctx, + int skip, struct pt_regs *regs) { - if (tr->trace_buffer.buffer) - ring_buffer_record_on(tr->trace_buffer.buffer); +} + +#endif + +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned int trace_ctx) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, type, trace_ctx); +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, trace_ctx); + + return event; +} + +void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->array_buffer.buffer) + ring_buffer_record_on(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to @@ -400,8 +1096,6 @@ static void tracer_tracing_on(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 0; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -416,26 +1110,49 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); -/** - * __trace_puts - write a constant string into the trace buffer. - * @ip: The address of the caller - * @str: The constant string to write - * @size: The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) + +static __always_inline void +__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_taskinfo_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); + } else + ring_buffer_unlock_commit(buffer); +} + +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) { struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct print_entry *entry; - unsigned long irq_flags; + unsigned int trace_ctx; int alloc; + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) + return 0; + alloc = sizeof(*entry) + size + 2; /* possible \n added */ - local_save_flags(irq_flags); - buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, preempt_count()); + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + guard(ring_buffer_nest)(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + trace_ctx); if (!event) return 0; @@ -452,9 +1169,21 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); - + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); return size; } +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + return __trace_array_puts(printk_trace, ip, str, size); +} EXPORT_SYMBOL_GPL(__trace_puts); /** @@ -464,16 +1193,28 @@ EXPORT_SYMBOL_GPL(__trace_puts); */ int __trace_bputs(unsigned long ip, const char *str) { + struct trace_array *tr = READ_ONCE(printk_trace); struct ring_buffer_event *event; - struct ring_buffer *buffer; + struct trace_buffer *buffer; struct bputs_entry *entry; - unsigned long irq_flags; + unsigned int trace_ctx; int size = sizeof(struct bputs_entry); - local_save_flags(irq_flags); - buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, preempt_count()); + if (!printk_binsafe(tr)) + return __trace_puts(ip, str, strlen(str)); + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + + guard(ring_buffer_nest)(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + trace_ctx); if (!event) return 0; @@ -482,71 +1223,147 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -/** - * trace_snapshot - take a snapshot of the current buffer. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. - * - * Note, make sure to allocate the snapshot with either - * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot - * - * If the snapshot buffer is not allocated, it will stop tracing. - * Basically making a permanent snapshot. - */ -void tracing_snapshot(void) +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) { - struct trace_array *tr = &global_trace; struct tracer *tracer = tr->current_trace; unsigned long flags; if (in_nmi()) { - internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); - internal_trace_puts("*** snapshot is being ignored ***\n"); + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { - internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); - internal_trace_puts("*** stopping trace here! ***\n"); - tracing_off(); + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! ***\n"); + tracer_tracing_off(tr); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { - internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); - internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } + +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + +/** + * tracing_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + + tracing_snapshot_instance(tr); +} EXPORT_SYMBOL_GPL(tracing_snapshot); -static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, - struct trace_buffer *size_buf, int cpu_id); -static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_cond_snapshot_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + +static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct array_buffer *buf, unsigned long val); -static int alloc_snapshot(struct trace_array *tr) +int tracing_alloc_snapshot_instance(struct trace_array *tr) { + int order; int ret; if (!tr->allocated_snapshot) { + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret < 0) + return ret; + /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; @@ -556,23 +1373,85 @@ static int alloc_snapshot(struct trace_array *tr) return 0; } -void free_snapshot(struct trace_array *tr) +static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ + ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); tr->allocated_snapshot = false; } +static int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX || tr->mapped) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_alloc_snapshot - allocate snapshot buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = tracing_alloc_snapshot_instance(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -582,34 +1461,150 @@ void free_snapshot(struct trace_array *tr) */ void tracing_snapshot_alloc(void) { - struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); - if (WARN_ON(ret < 0)) + ret = tracing_alloc_snapshot(); + if (ret < 0) return; tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + int ret; + + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + guard(mutex)(&trace_types_lock); + + if (tr->current_trace->use_max_tr) + return -EBUSY; + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ + if (tr->cond_snapshot) + return -EBUSY; + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = no_free_ptr(cond_snapshot); + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return 0; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + tracing_disarm_snapshot(tr); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#define free_snapshot(tr) do { } while (0) +#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ -static void tracer_tracing_off(struct trace_array *tr) +void tracer_tracing_off(struct trace_array *tr) { - if (tr->trace_buffer.buffer) - ring_buffer_record_off(tr->trace_buffer.buffer); + if (tr->array_buffer.buffer) + ring_buffer_record_off(tr->array_buffer.buffer); /* * This flag is looked at when buffers haven't been allocated * yet, or by some tracers (like irqsoff), that just want to @@ -619,8 +1614,39 @@ static void tracer_tracing_off(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 1; - /* Make the flag seen by readers */ - smp_wmb(); +} + +/** + * tracer_tracing_disable() - temporary disable the buffer from write + * @tr: The trace array to disable its buffer for + * + * Expects trace_tracing_enable() to re-enable tracing. + * The difference between this and tracer_tracing_off() is that this + * is a counter and can nest, whereas, tracer_tracing_off() can + * be called multiple times and a single trace_tracing_on() will + * enable it. + */ +void tracer_tracing_disable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_disable(tr->array_buffer.buffer); +} + +/** + * tracer_tracing_enable() - counter part of tracer_tracing_disable() + * @tr: The trace array that had tracer_tracincg_disable() called on it + * + * This is called after tracer_tracing_disable() has been called on @tr, + * when it's safe to re-enable tracing. + */ +void tracer_tracing_enable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_enable(tr->array_buffer.buffer); } /** @@ -639,8 +1665,11 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { - if (__disable_trace_on_warning) + if (__disable_trace_on_warning) { + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); tracing_off(); + } } /** @@ -649,10 +1678,10 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -static int tracer_tracing_is_on(struct trace_array *tr) +bool tracer_tracing_is_on(struct trace_array *tr) { - if (tr->trace_buffer.buffer) - return ring_buffer_record_is_on(tr->trace_buffer.buffer); + if (tr->array_buffer.buffer) + return ring_buffer_record_is_set_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } @@ -672,10 +1701,12 @@ static int __init set_buf_size(char *str) if (!str) return 0; buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); @@ -700,34 +1731,18 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) return nsecs / 1000; } -/* These must match the bit postions in trace_iterator_flags */ +/* + * TRACE_FLAGS is defined as a tuple matching bit masks with strings. + * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that + * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list + * of strings in the order that the evals (enum) were defined. + */ +#undef C +#define C(a, b) b + +/* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { - "print-parent", - "sym-offset", - "sym-addr", - "verbose", - "raw", - "hex", - "bin", - "block", - "stacktrace", - "trace_printk", - "ftrace_preempt", - "branch", - "annotate", - "userstacktrace", - "sym-userobj", - "printk-msg-only", - "context-info", - "latency-format", - "sleep-time", - "graph-time", - "record-cmd", - "overwrite", - "disable_on_free", - "irq-info", - "markers", - "function-trace", + TRACE_FLAGS NULL }; @@ -736,14 +1751,26 @@ static struct { const char *name; int in_ns; /* is this clock in nanoseconds? */ } trace_clocks[] = { - { trace_clock_local, "local", 1 }, - { trace_clock_global, "global", 1 }, - { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 1 }, - { trace_clock, "perf", 1 }, + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, + { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_tai_fast_ns, "tai", 1 }, ARCH_TRACE_CLOCKS }; +bool trace_clock_in_ns(struct trace_array *tr) +{ + if (trace_clocks[tr->clock_id].in_ns) + return true; + + return false; +} + /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -765,6 +1792,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size) void trace_parser_put(struct trace_parser *parser) { kfree(parser->buffer); + parser->buffer = NULL; } /* @@ -790,7 +1818,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; @@ -804,134 +1832,162 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } + parser->idx = 0; + /* only spaces were written */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } - - parser->idx = 0; } /* read the non-space input */ - while (cnt && !isspace(ch)) { + while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; - goto out; + goto fail; } + ret = get_user(ch, ubuf++); if (ret) - goto out; + goto fail; read++; cnt--; } /* We either got finished input or we have to wait for another call. */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; - } else { + } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + /* Make sure the parsed string always terminates with '\0'. */ + parser->buffer[parser->idx] = 0; + } else { + ret = -EINVAL; + goto fail; } *ppos += read; - ret = read; - -out: + return read; +fail: + trace_parser_fail(parser); return ret; } -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) +/* TODO add a seq_buf_to_buffer() */ +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - int ret; - - if (!cnt) - return 0; - if (s->len <= s->readpos) + if (trace_seq_used(s) <= s->readpos) return -EBUSY; - len = s->len - s->readpos; + len = trace_seq_used(s) - s->readpos; if (cnt > len) cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; + memcpy(buf, s->buffer + s->readpos, cnt); s->readpos += cnt; return cnt; } -static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE +static const struct file_operations tracing_max_lat_fops; + +#ifdef LATENCY_FS_NOTIFY + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) { - int len; + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); +} - if (s->len <= s->readpos) - return -EBUSY; +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - memcpy(buf, s->buffer + s->readpos, cnt); +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, + d_tracer, tr, + &tracing_max_lat_fops); +} - s->readpos += cnt; - return cnt; +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +late_initcall_sync(latency_fsnotify_init); -unsigned long __read_mostly tracing_thresh; +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. + */ + irq_work_queue(&tr->fsnotify_irqwork); +} -#ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; +#else /* !LATENCY_FS_NOTIFY */ + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ + d_tracer, tr, &tracing_max_lat_fops) + +#endif /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_buffer *trace_buf = &tr->trace_buffer; - struct trace_buffer *max_buf = &tr->max_buffer; + struct array_buffer *trace_buf = &tr->array_buffer; + struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; - max_data->saved_latency = tracing_max_latency; + max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -948,6 +2004,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -955,15 +2012,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) { - struct ring_buffer *buf; - if (tr->stop_count) return; @@ -975,21 +2032,35 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); - buf = tr->trace_buffer.buffer; - tr->trace_buffer.buffer = tr->max_buffer.buffer; - tr->max_buffer.buffer = buf; + /* Inherit the recordable setting from array_buffer */ + if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } +#endif + swap(tr->array_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + + arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } /** * update_max_tr_single - only copy one trace over, and reset the rest - * @tr - tracer - * @tsk - task with the latency - * @cpu - the cpu of the buffer to copy. + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the buffer to copy. * * Flip the trace of a single CPU buffer between the @tr and the max_tr. */ @@ -1008,9 +2079,9 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); - ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -1018,31 +2089,89 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. + * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit in progress\n"); + "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } + #endif /* CONFIG_TRACER_MAX_TRACE */ -static void default_wait_pipe(struct trace_iterator *iter) +struct pipe_wait { + struct trace_iterator *iter; + int wait_index; +}; + +static bool wait_pipe_cond(void *data) +{ + struct pipe_wait *pwait = data; + struct trace_iterator *iter = pwait->iter; + + if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index) + return true; + + return iter->closed; +} + +static int wait_on_pipe(struct trace_iterator *iter, int full) { + struct pipe_wait pwait; + int ret; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return; + return 0; + + pwait.wait_index = atomic_read_acquire(&iter->wait_index); + pwait.iter = iter; + + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, + wait_pipe_cond, &pwait); - ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * Make sure this is still the snapshot buffer, as if a snapshot were + * to happen, this would now be the main buffer. + */ + if (iter->snapshot) + iter->array_buffer = &iter->tr->max_buffer; +#endif + return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST +static bool selftests_can_run; + +struct trace_selftests { + struct list_head list; + struct tracer *type; +}; + +static LIST_HEAD(postponed_selftests); + +static int save_selftest(struct tracer *type) +{ + struct trace_selftests *selftest; + + selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); + if (!selftest) + return -ENOMEM; + + selftest->type = type; + list_add(&selftest->list, &postponed_selftests); + return 0; +} + static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; + struct tracer_flags *saved_flags = tr->current_trace_flags; struct tracer *saved_tracer = tr->current_trace; int ret; @@ -1050,20 +2179,35 @@ static int run_tracer_selftest(struct tracer *type) return 0; /* + * If a tracer registers early in boot up (before scheduling is + * initialized and such), then do not run its selftests yet. + * Instead, run it a little later in the boot process. + */ + if (!selftests_can_run) + return save_selftest(type); + + if (!tracing_is_on()) { + pr_warn("Selftest for tracer %s skipped due to tracing disabled\n", + type->name); + return 0; + } + + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current * tracer to be this tracer. The tracer can then run some * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); tr->current_trace = type; + tr->current_trace_flags = type->flags ? : type->default_flags; #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; @@ -1075,6 +2219,7 @@ static int run_tracer_selftest(struct tracer *type) ret = type->selftest(type, tr); /* the test is responsible for resetting too */ tr->current_trace = saved_tracer; + tr->current_trace_flags = saved_flags; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ @@ -1082,14 +2227,14 @@ static int run_tracer_selftest(struct tracer *type) return -1; } /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } @@ -1098,21 +2243,101 @@ static int run_tracer_selftest(struct tracer *type) printk(KERN_CONT "PASSED\n"); return 0; } + +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. + */ + cond_resched(); + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + +static __init int init_trace_selftests(void) +{ + struct trace_selftests *p, *n; + struct tracer *t, **last; + int ret; + + selftests_can_run = true; + + guard(mutex)(&trace_types_lock); + + if (list_empty(&postponed_selftests)) + return 0; + + pr_info("Running postponed tracer tests:\n"); + + tracing_selftest_running = true; + list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. + */ + cond_resched(); + ret = run_tracer_selftest(p->type); + /* If the test fails, then warn and remove from available_tracers */ + if (ret < 0) { + WARN(1, "tracer: %s failed selftest, disabling\n", + p->type->name); + last = &trace_types; + for (t = trace_types; t; t = t->next) { + if (t == p->type) { + *last = t->next; + break; + } + last = &t->next; + } + } + list_del(&p->list); + kfree(p); + } + tracing_selftest_running = false; + + return 0; +} +core_initcall(init_trace_selftests); #else -static inline int run_tracer_selftest(struct tracer *type) +static inline int do_run_tracer_selftest(struct tracer *type) { return 0; } #endif /* CONFIG_FTRACE_STARTUP_TEST */ +static int add_tracer(struct trace_array *tr, struct tracer *t); + +static void __init apply_trace_boot_options(void); + +static void free_tracers(struct trace_array *tr) +{ + struct tracers *t, *n; + + lockdep_assert_held(&trace_types_lock); + + list_for_each_entry_safe(t, n, &tr->tracers, list) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } +} + /** * register_tracer - register a tracer with the ftrace system. - * @type - the plugin for the tracer + * @type: the plugin for the tracer * * Register a new plugin tracer. */ -int register_tracer(struct tracer *type) +int __init register_tracer(struct tracer *type) { + struct trace_array *tr; struct tracer *t; int ret = 0; @@ -1126,9 +2351,13 @@ int register_tracer(struct tracer *type) return -1; } - mutex_lock(&trace_types_lock); + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not register tracer %s due to lockdown\n", + type->name); + return -EPERM; + } - tracing_selftest_running = true; + mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(type->name, t->name) == 0) { @@ -1140,51 +2369,51 @@ int register_tracer(struct tracer *type) } } - if (!type->set_flag) - type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else - if (!type->flags->opts) - type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; + /* store the tracer for __set_tracer_option */ + if (type->flags) + type->flags->trace = type; - ret = run_tracer_selftest(type); + ret = do_run_tracer_selftest(type); if (ret < 0) goto out; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + ret = add_tracer(tr, type); + if (ret < 0) { + /* The tracer will still exist but without options */ + pr_warn("Failed to create tracer options for %s\n", type->name); + break; + } + } + type->next = trace_types; trace_types = type; out: - tracing_selftest_running = false; mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(type->name); + WARN_ON(tracing_set_tracer(&global_trace, type->name) < 0); default_bootup_tracer = NULL; + + apply_trace_boot_options(); + /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = true; -#ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", - type->name); -#endif + disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } -void tracing_reset(struct trace_buffer *buf, int cpu) +static void tracing_reset_cpu(struct array_buffer *buf, int cpu) { - struct ring_buffer *buffer = buf->buffer; + struct trace_buffer *buffer = buf->buffer; if (!buffer) return; @@ -1192,16 +2421,15 @@ void tracing_reset(struct trace_buffer *buf, int cpu) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_buffer *buf) +void tracing_reset_online_cpus(struct array_buffer *buf) { - struct ring_buffer *buffer = buf->buffer; - int cpu; + struct trace_buffer *buffer = buf->buffer; if (!buffer) return; @@ -1209,51 +2437,56 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) ring_buffer_record_disable(buffer); /* Make sure all commits have finished */ - synchronize_sched(); + synchronize_rcu(); - buf->time_start = ftrace_now(buf->cpu); + buf->time_start = buffer_ftrace_now(buf, buf->cpu); - for_each_online_cpu(cpu) - ring_buffer_reset_cpu(buffer, cpu); + ring_buffer_reset_online_cpus(buffer); ring_buffer_record_enable(buffer); } -void tracing_reset_current(int cpu) +static void tracing_reset_all_cpus(struct array_buffer *buf) { - tracing_reset(&global_trace.trace_buffer, cpu); + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset(buffer); + + ring_buffer_record_enable(buffer); } -void tracing_reset_all_online_cpus(void) +/* Must have trace_types_lock held */ +void tracing_reset_all_online_cpus_unlocked(void) { struct trace_array *tr; - mutex_lock(&trace_types_lock); + lockdep_assert_held(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { - tracing_reset_online_cpus(&tr->trace_buffer); + if (!tr->clear_trace) + continue; + tr->clear_trace = false; + tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); #endif } - mutex_unlock(&trace_types_lock); } -#define SAVED_CMDLINES 128 -#define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; -static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; - -/* temporary disable recording */ -static atomic_t trace_record_cmdline_disabled __read_mostly; - -static void trace_init_cmdlines(void) +void tracing_reset_all_online_cpus(void) { - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; + guard(mutex)(&trace_types_lock); + tracing_reset_all_online_cpus_unlocked(); } int is_tracing_stopped(void) @@ -1261,94 +2494,72 @@ int is_tracing_stopped(void) return global_trace.stop_count; } -/** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected. This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) -{ - tracing_disabled = 1; - ftrace_stop(); - tracing_off_permanent(); -} - -/** - * tracing_start - quick start of the tracer - * - * If tracing is enabled but was stopped by tracing_stop, - * this will start the tracer back up. - */ -void tracing_start(void) +static void tracing_start_tr(struct trace_array *tr) { - struct ring_buffer *buffer; - unsigned long flags; + struct trace_buffer *buffer; if (tracing_disabled) return; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (--global_trace.stop_count) { - if (global_trace.stop_count < 0) { + guard(raw_spinlock_irqsave)(&tr->start_lock); + if (--tr->stop_count) { + if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - global_trace.stop_count = 0; + tr->stop_count = 0; } - goto out; + return; } /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.trace_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); +} - ftrace_start(); - out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) + +{ + return tracing_start_tr(&global_trace); } -static void tracing_start_tr(struct trace_array *tr) +static void tracing_stop_tr(struct trace_array *tr) { - struct ring_buffer *buffer; - unsigned long flags; + struct trace_buffer *buffer; - if (tracing_disabled) + guard(raw_spinlock_irqsave)(&tr->start_lock); + if (tr->stop_count++) return; - /* If global, we need to also start the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_start(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); + /* Prevent the buffers from switching */ + arch_spin_lock(&tr->max_lock); - if (--tr->stop_count) { - if (tr->stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - tr->stop_count = 0; - } - goto out; - } + buffer = tr->array_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); - buffer = tr->trace_buffer.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = tr->max_buffer.buffer; if (buffer) - ring_buffer_record_enable(buffer); + ring_buffer_record_disable(buffer); +#endif - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); + arch_spin_unlock(&tr->max_lock); } /** @@ -1359,395 +2570,554 @@ static void tracing_start_tr(struct trace_array *tr) */ void tracing_stop(void) { - struct ring_buffer *buffer; - unsigned long flags; + return tracing_stop_tr(&global_trace); +} - ftrace_stop(); - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (global_trace.stop_count++) - goto out; +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? + TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} +EXPORT_SYMBOL_GPL(trace_handle_return); - /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); +static unsigned short migration_disable_value(void) +{ +#if defined(CONFIG_SMP) + return current->migration_disabled; +#else + return 0; +#endif +} - buffer = global_trace.trace_buffer.buffer; - if (buffer) - ring_buffer_record_disable(buffer); +unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +{ + unsigned int trace_flags = irqs_status; + unsigned int pc; -#ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; - if (buffer) - ring_buffer_record_disable(buffer); -#endif + pc = preempt_count(); - arch_spin_unlock(&ftrace_max_lock); + if (pc & NMI_MASK) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) + trace_flags |= TRACE_FLAG_HARDIRQ; + if (in_serving_softirq()) + trace_flags |= TRACE_FLAG_SOFTIRQ; + if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) + trace_flags |= TRACE_FLAG_BH_OFF; + + if (tif_need_resched()) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; + if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; +} - out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +struct ring_buffer_event * +trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); } -static void tracing_stop_tr(struct trace_array *tr) +DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DEFINE_PER_CPU(int, trace_buffered_event_cnt); +static int trace_buffered_event_ref; + +/** + * trace_buffered_event_enable - enable buffering events + * + * When events are being filtered, it is quicker to use a temporary + * buffer to write the event data into if there's a likely chance + * that it will not be committed. The discard of the ring buffer + * is not as fast as committing, and is much slower than copying + * a commit. + * + * When an event is to be filtered, allocate per cpu buffers to + * write the event data into, and if the event is filtered and discarded + * it is simply dropped, otherwise, the entire data is to be committed + * in one shot. + */ +void trace_buffered_event_enable(void) { - struct ring_buffer *buffer; - unsigned long flags; + struct ring_buffer_event *event; + struct page *page; + int cpu; - /* If global, we need to also stop the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_stop(); + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); - raw_spin_lock_irqsave(&tr->start_lock, flags); - if (tr->stop_count++) - goto out; + if (trace_buffered_event_ref++) + return; - buffer = tr->trace_buffer.buffer; - if (buffer) - ring_buffer_record_disable(buffer); + for_each_tracing_cpu(cpu) { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + /* This is just an optimization and can handle failures */ + if (!page) { + pr_err("Failed to allocate event buffer\n"); + break; + } - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); + event = page_address(page); + memset(event, 0, sizeof(*event)); + + per_cpu(trace_buffered_event, cpu) = event; + + scoped_guard(preempt,) { + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + } + } } -void trace_stop_cmdline_recording(void); +static void enable_trace_buffered_event(void *data) +{ + this_cpu_dec(trace_buffered_event_cnt); +} -static void trace_save_cmdline(struct task_struct *tsk) +static void disable_trace_buffered_event(void *data) { - unsigned pid, idx; + this_cpu_inc(trace_buffered_event_cnt); +} - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; +/** + * trace_buffered_event_disable - disable buffering events + * + * When a filter is removed, it is faster to not use the buffered + * events, and to commit directly into the ring buffer. Free up + * the temp buffers when there are no more users. This requires + * special synchronization with current events. + */ +void trace_buffered_event_disable(void) +{ + int cpu; - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - */ - if (!arch_spin_trylock(&trace_cmdline_lock)) + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (WARN_ON_ONCE(!trace_buffered_event_ref)) return; - idx = map_pid_to_cmdline[tsk->pid]; - if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; + if (--trace_buffered_event_ref) + return; - /* - * Check whether the cmdline buffer at idx has a pid - * mapped. We are going to overwrite that entry so we - * need to clear the map_pid_to_cmdline. Otherwise we - * would read the new comm for the old pid. - */ - pid = map_cmdline_to_pid[idx]; - if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + /* For each CPU, set the buffer as used. */ + on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, + NULL, true); - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; + /* Wait for all current users to finish */ + synchronize_rcu(); - cmdline_idx = idx; + for_each_tracing_cpu(cpu) { + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; } - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + /* + * Wait for all CPUs that potentially started checking if they can use + * their event buffer only after the previous synchronize_rcu() call and + * they still read a valid pointer from trace_buffered_event. It must be + * ensured they don't see cleared trace_buffered_event_cnt else they + * could wrongly decide to use the pointed-to buffer which is now freed. + */ + synchronize_rcu(); - arch_spin_unlock(&trace_cmdline_lock); + /* For each CPU, relinquish the buffer */ + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); } -void trace_find_cmdline(int pid, char comm[]) +static struct trace_buffer *temp_buffer; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + struct trace_event_file *trace_file, + int type, unsigned long len, + unsigned int trace_ctx) { - unsigned map; + struct ring_buffer_event *entry; + struct trace_array *tr = trace_file->tr; + int val; - if (!pid) { - strcpy(comm, "<idle>"); - return; - } + *current_rb = tr->array_buffer.buffer; - if (WARN_ON_ONCE(pid < 0)) { - strcpy(comm, "<XXX>"); - return; - } + if (!tr->no_filter_buffering_ref && + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { + preempt_disable_notrace(); + /* + * Filtering is on, so try to use the per cpu buffer first. + * This buffer will simulate a ring_buffer_event, + * where the type_len is zero and the array[0] will + * hold the full length. + * (see include/linux/ring-buffer.h for details on + * how the ring_buffer_event is structured). + * + * Using a temp buffer during filtering and copying it + * on a matched filter is quicker than writing directly + * into the ring buffer and then discarding it when + * it doesn't match. That is because the discard + * requires several atomic operations to get right. + * Copying on match and doing nothing on a failed match + * is still quicker than no copy on match, but having + * to discard out of the ring buffer on a failed match. + */ + if ((entry = __this_cpu_read(trace_buffered_event))) { + int max_len = PAGE_SIZE - struct_size(entry, array, 1); - if (pid > PID_MAX_DEFAULT) { - strcpy(comm, "<...>"); - return; - } + val = this_cpu_inc_return(trace_buffered_event_cnt); - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - map = map_pid_to_cmdline[pid]; - if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); - else - strcpy(comm, "<...>"); + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ + if (val == 1 && likely(len <= max_len)) { + trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + /* Return with preemption disabled */ + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); + } + /* __trace_buffer_lock_reserve() disables preemption */ + preempt_enable_notrace(); + } - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the trigger to use. It's recursive + * safe and will not be recorded anywhere. + */ + if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); + } + return entry; } +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); +static DEFINE_MUTEX(tracepoint_printk_mutex); -void tracing_record_cmdline(struct task_struct *tsk) +static void output_printk(struct trace_event_buffer *fbuffer) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + struct trace_event_call *event_call; + struct trace_event_file *file; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + /* We should never get here if iter is NULL */ + if (WARN_ON_ONCE(!iter)) return; - if (!__this_cpu_read(trace_cmdline_save)) + event_call = fbuffer->trace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) return; - __this_cpu_write(trace_cmdline_save, false); + file = fbuffer->trace_file; + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, fbuffer->entry))) + return; - trace_save_cmdline(tsk); -} + event = &fbuffer->trace_file->event_call->event; -void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, - int pc) -{ - struct task_struct *tsk = current; + raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->flags = -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | -#else - TRACE_FLAG_IRQS_NOSUPPORT | -#endif - ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } -EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -struct ring_buffer_event * -trace_buffer_lock_reserve(struct ring_buffer *buffer, - int type, - unsigned long len, - unsigned long flags, int pc) +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) { - struct ring_buffer_event *event; + int save_tracepoint_printk; + int ret; - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) { - struct trace_entry *ent = ring_buffer_event_data(event); + guard(mutex)(&tracepoint_printk_mutex); + save_tracepoint_printk = tracepoint_printk; - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; - } + ret = proc_dointvec(table, write, buffer, lenp, ppos); - return event; -} + /* + * This will force exiting early, as tracepoint_printk + * is always zero when tracepoint_printk_iter is not allocated + */ + if (!tracepoint_print_iter) + tracepoint_printk = 0; -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_cmdline_save, true); - ring_buffer_unlock_commit(buffer, event); -} + if (save_tracepoint_printk == tracepoint_printk) + return ret; -static inline void -__trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __buffer_unlock_commit(buffer, event); + if (tracepoint_printk) + static_key_enable(&tracepoint_printk_key.key); + else + static_key_disable(&tracepoint_printk_key.key); - ftrace_trace_stack(buffer, flags, 6, pc); - ftrace_trace_userstack(buffer, flags, pc); + return ret; } -void trace_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { - __trace_buffer_unlock_commit(buffer, event, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); + enum event_trigger_type tt = ETT_NONE; + struct trace_event_file *file = fbuffer->trace_file; -struct ring_buffer_event * -trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, - struct ftrace_event_file *ftrace_file, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = ftrace_file->tr->trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, + fbuffer->entry, &tt)) + goto discard; -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); + if (static_key_false(&tracepoint_printk_key.key)) + output_printk(fbuffer); + + if (static_branch_unlikely(&trace_event_exports_enabled)) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); + + trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, + fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); + +discard: + if (tt) + event_triggers_post_call(file, tt); -void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __trace_buffer_unlock_commit(buffer, event, flags, pc); } -EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 3 -void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer, +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, struct ring_buffer_event *event, - unsigned long flags, int pc, + unsigned int trace_ctx, struct pt_regs *regs) { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); - ftrace_trace_userstack(buffer, flags, pc); + /* + * If regs is not set, then skip the necessary functions. + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are not that meaningful. + */ + ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); + ftrace_trace_userstack(tr, buffer, trace_ctx); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) +/* + * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. + */ +void +trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, + struct ring_buffer_event *event) { - ring_buffer_discard_commit(buffer, event); + __buffer_unlock_commit(buffer, event); } -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void -trace_function(struct trace_array *tr, - unsigned long ip, unsigned long parent_ip, unsigned long flags, - int pc) +trace_function(struct trace_array *tr, unsigned long ip, unsigned long + parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { - struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; + int size = sizeof(*entry); - /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; + size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, + trace_ctx); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); + __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING 4 + +#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) + struct ftrace_stack { - unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; + unsigned long calls[FTRACE_KSTACK_ENTRIES]; }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); + +struct ftrace_stacks { + struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; +}; + +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct ring_buffer *buffer, - unsigned long flags, - int skip, int pc, struct pt_regs *regs) +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { - struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + unsigned int size, nr_entries; + struct ftrace_stack *fstack; struct stack_entry *entry; - struct stack_trace trace; - int use_stack; - int size = FTRACE_STACK_ENTRIES; - - trace.nr_entries = 0; - trace.skip = skip; + int stackidx; /* - * Since events can happen in NMIs there's no safe way to - * use the per cpu ftrace_stacks. We reserve it and if an interrupt - * or NMI comes in, it will just have to use the default - * FTRACE_STACK_SIZE. + * Add one, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. */ - preempt_disable_notrace(); +#ifndef CONFIG_UNWINDER_ORC + if (!regs) + skip++; +#endif + + guard(preempt_notrace)(); + + stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + + /* This should never happen. If it does, yell once and skip */ + if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) + goto out; - use_stack = __this_cpu_inc_return(ftrace_stack_reserve); /* - * We don't need any atomic variables, just a barrier. - * If an interrupt comes in, we don't care, because it would - * have exited and put the counter back to what we want. - * We just need a barrier to keep gcc from moving things - * around. + * The above __this_cpu_inc_return() is 'atomic' cpu local. An + * interrupt will either see the value pre increment or post + * increment. If the interrupt happens pre increment it will have + * restored the counter when it returns. We just need a barrier to + * keep gcc from moving things around. */ barrier(); - if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; - trace.max_entries = FTRACE_STACK_MAX_ENTRIES; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); + fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; + size = ARRAY_SIZE(fstack->calls); - if (trace.nr_entries > size) - size = trace.nr_entries; - } else - /* From now on, use_stack is a boolean */ - use_stack = 0; + if (regs) { + nr_entries = stack_trace_save_regs(regs, fstack->calls, + size, skip); + } else { + nr_entries = stack_trace_save(fstack->calls, size, skip); + } - size *= sizeof(unsigned long); +#ifdef CONFIG_DYNAMIC_FTRACE + /* Mark entry of stack trace as trampoline code */ + if (tr->ops && tr->ops->trampoline) { + unsigned long tramp_start = tr->ops->trampoline; + unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; + unsigned long *calls = fstack->calls; + + for (int i = 0; i < nr_entries; i++) { + if (calls[i] >= tramp_start && calls[i] < tramp_end) + calls[i] = FTRACE_TRAMPOLINE_MARKER; + } + } +#endif - event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry) + size, flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, + struct_size(entry, caller, nr_entries), + trace_ctx); if (!event) goto out; entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, size); - - if (use_stack) - memcpy(&entry->caller, trace.entries, - trace.nr_entries * sizeof(unsigned long)); - else { - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.entries = entry->caller; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - } - - entry->size = trace.nr_entries; + entry->size = nr_entries; + memcpy(&entry->caller, fstack->calls, + flex_array_size(entry, caller, nr_entries)); - if (!filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); - preempt_enable_notrace(); - } -void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc, struct pt_regs *regs) +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { - if (!(trace_flags & TRACE_ITER_STACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) return; - __ftrace_trace_stack(buffer, flags, skip, pc, regs); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } -void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, - int skip, int pc) +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) { - if (!(trace_flags & TRACE_ITER_STACKTRACE)) + struct trace_buffer *buffer = tr->array_buffer.buffer; + + if (rcu_is_watching()) { + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; + } - __ftrace_trace_stack(buffer, flags, skip, pc, NULL); -} + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) + return; -void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, - int pc) -{ - __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); + /* + * When an NMI triggers, RCU is enabled via ct_nmi_enter(), + * but if the above rcu_is_watching() failed, then the NMI + * triggered someplace critical, and ct_irq_enter() should + * not be called from NMI. + */ + if (unlikely(in_nmi())) + return; + + ct_irq_enter_irqson(); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); + ct_irq_exit_irqson(); } /** @@ -1756,33 +3126,29 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, */ void trace_dump_stack(int skip) { - unsigned long flags; - if (tracing_disabled || tracing_selftest_running) return; - local_save_flags(flags); - - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; - __ftrace_trace_stack(global_trace.trace_buffer.buffer, - flags, skip, preempt_count(), NULL); +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif + __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, + tracing_gen_ctx(), skip, NULL); } +EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); -void -ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) +static void +ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx) { - struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - struct stack_trace trace; - if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER(USERSTACKTRACE))) return; /* @@ -1796,14 +3162,14 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) * prevent recursion, since the user stack tracing may * trigger other kernel events. */ - preempt_disable(); + guard(preempt)(); if (__this_cpu_read(user_stack_count)) - goto out; + return; __this_cpu_inc(user_stack_count); - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, - sizeof(*entry), flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, + sizeof(*entry), trace_ctx); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); @@ -1811,109 +3177,102 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = 0; - trace.entries = entry->caller; - - save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); - out: - preempt_enable(); } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx) { - ftrace_trace_userstack(tr, flags, preempt_count()); } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ +static inline void +func_repeats_set_delta_ts(struct func_repeats_entry *entry, + unsigned long long delta) +{ + entry->bottom_delta_ts = delta & U32_MAX; + entry->top_delta_ts = (delta >> 32); +} + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct func_repeats_entry *entry; + struct ring_buffer_event *event; + u64 delta; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, + sizeof(*entry), trace_ctx); + if (!event) + return; + + delta = ring_buffer_event_time_stamp(buffer, event) - + last_info->ts_last_call; + + entry = ring_buffer_event_data(event); + entry->ip = last_info->ip; + entry->parent_ip = last_info->parent_ip; + entry->count = last_info->count; + func_repeats_set_delta_ts(entry, delta); + + __buffer_unlock_commit(buffer, event); +} + /* created for use with alloc_percpu */ struct trace_buffer_struct { - char buffer[TRACE_BUF_SIZE]; + int nesting; + char buffer[4][TRACE_BUF_SIZE]; }; -static struct trace_buffer_struct *trace_percpu_buffer; -static struct trace_buffer_struct *trace_percpu_sirq_buffer; -static struct trace_buffer_struct *trace_percpu_irq_buffer; -static struct trace_buffer_struct *trace_percpu_nmi_buffer; +static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* - * The buffer used is dependent on the context. There is a per cpu - * buffer for normal context, softirq contex, hard irq context and - * for NMI context. Thise allows for lockless recording. - * - * Note, if the buffers failed to be allocated, then this returns NULL + * This allows for lockless recording. If we're nested too deeply, then + * this returns NULL. */ static char *get_trace_buf(void) { - struct trace_buffer_struct *percpu_buffer; + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - /* - * If we have allocated per cpu buffers, then we do not - * need to do any locking. - */ - if (in_nmi()) - percpu_buffer = trace_percpu_nmi_buffer; - else if (in_irq()) - percpu_buffer = trace_percpu_irq_buffer; - else if (in_softirq()) - percpu_buffer = trace_percpu_sirq_buffer; - else - percpu_buffer = trace_percpu_buffer; - - if (!percpu_buffer) + if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; - return this_cpu_ptr(&percpu_buffer->buffer[0]); + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting - 1][0]; } -static int alloc_percpu_trace_buffer(void) +static void put_trace_buf(void) { - struct trace_buffer_struct *buffers; - struct trace_buffer_struct *sirq_buffers; - struct trace_buffer_struct *irq_buffers; - struct trace_buffer_struct *nmi_buffers; - - buffers = alloc_percpu(struct trace_buffer_struct); - if (!buffers) - goto err_warn; + /* Don't let the decrement of nesting leak before this */ + barrier(); + this_cpu_dec(trace_percpu_buffer->nesting); +} - sirq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!sirq_buffers) - goto err_sirq; +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct __percpu *buffers; - irq_buffers = alloc_percpu(struct trace_buffer_struct); - if (!irq_buffers) - goto err_irq; + if (trace_percpu_buffer) + return 0; - nmi_buffers = alloc_percpu(struct trace_buffer_struct); - if (!nmi_buffers) - goto err_nmi; + buffers = alloc_percpu(struct trace_buffer_struct); + if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; trace_percpu_buffer = buffers; - trace_percpu_sirq_buffer = sirq_buffers; - trace_percpu_irq_buffer = irq_buffers; - trace_percpu_nmi_buffer = nmi_buffers; - return 0; - - err_nmi: - free_percpu(irq_buffers); - err_irq: - free_percpu(sirq_buffers); - err_sirq: - free_percpu(buffers); - err_warn: - WARN(1, "Could not allocate percpu trace_printk buffer"); - return -ENOMEM; } static int buffers_allocated; @@ -1926,10 +3285,25 @@ void trace_printk_init_buffers(void) if (alloc_percpu_trace_buffer()) return; - pr_info("ftrace: Allocated trace_printk buffers\n"); + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ - tracing_update_buffers(); + tracing_update_buffers(&global_trace); buffers_allocated = 1; @@ -1939,9 +3313,10 @@ void trace_printk_init_buffers(void) * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ - if (global_trace.trace_buffer.buffer) + if (global_trace.array_buffer.buffer) tracing_start_cmdline_record(); } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { @@ -1964,18 +3339,22 @@ static void trace_printk_start_stop_comm(int enabled) /** * trace_vbprintk - write binary msg to tracing buffer - * + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_array *tr = &global_trace; + struct trace_buffer *buffer; + struct trace_array *tr = READ_ONCE(printk_trace); struct bprint_entry *entry; - unsigned long flags; + unsigned int trace_ctx; char *tbuffer; - int len = 0, size, pc; + int len = 0, size; + + if (!printk_binsafe(tr)) + return trace_vprintk(ip, fmt, args); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -1983,93 +3362,90 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); - pc = preempt_count(); - preempt_disable_notrace(); + trace_ctx = tracing_gen_ctx(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) - goto out; + goto out_put; - local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - flags, pc); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) { + buffer = tr->array_buffer.buffer; + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, tbuffer, sizeof(u32) * len); __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); } +out_put: + put_trace_buf(); -out: - preempt_enable_notrace(); +out_nobuffer: unpause_graph_tracing(); return len; } EXPORT_SYMBOL_GPL(trace_vbprintk); -static int -__trace_array_vprintk(struct ring_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { - struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - int len = 0, size, pc; + int len = 0, size; struct print_entry *entry; - unsigned long flags; + unsigned int trace_ctx; char *tbuffer; - if (tracing_disabled || tracing_selftest_running) + if (tracing_disabled) return 0; /* Don't pollute graph traces with trace_vprintk internals */ pause_graph_tracing(); - pc = preempt_count(); - preempt_disable_notrace(); + trace_ctx = tracing_gen_ctx(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { len = 0; - goto out; + goto out_nobuffer; } - len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - if (len > TRACE_BUF_SIZE) - goto out; + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - local_save_flags(flags); size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - flags, pc); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; - memcpy(&entry->buf, tbuffer, len); - entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) { + memcpy(&entry->buf, tbuffer, len + 1); __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); } - out: - preempt_enable_notrace(); +out: + put_trace_buf(); + +out_nobuffer: unpause_graph_tracing(); return len; @@ -2078,16 +3454,46 @@ __trace_array_vprintk(struct ring_buffer *buffer, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); + if (tracing_selftest_running && tr == &global_trace) + return 0; + + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); } +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever incorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return 0; + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -2095,14 +3501,36 @@ int trace_array_printk(struct trace_array *tr, va_end(ap); return ret; } +EXPORT_SYMBOL_GPL(trace_array_printk); -int trace_array_printk_buf(struct ring_buffer *buffer, +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + +int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { int ret; va_list ap; - if (!(trace_flags & TRACE_ITER_PRINTK)) + if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) return 0; va_start(ap, fmt); @@ -2113,7 +3541,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_vprintk(&global_trace, ip, fmt, args); + return trace_array_vprintk(printk_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -2123,7 +3551,7 @@ static void trace_iterator_increment(struct trace_iterator *iter) iter->idx++; if (buf_iter) - ring_buffer_read(buf_iter, NULL); + ring_buffer_iter_advance(buf_iter); } static struct trace_entry * @@ -2133,11 +3561,15 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); - if (buf_iter) + if (buf_iter) { event = ring_buffer_iter_peek(buf_iter, ts); - else - event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, + if (lost_events) + *lost_events = ring_buffer_iter_dropped(buf_iter) ? + (unsigned long)-1 : 0; + } else { + event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, lost_events); + } if (event) { iter->ent_size = ring_buffer_event_length(event); @@ -2151,7 +3583,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->trace_buffer->buffer; + struct trace_buffer *buffer = iter->array_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -2207,11 +3639,245 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, return next; } +#define STATIC_FMT_BUF_SIZE 128 +static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; + +char *trace_iter_expand_format(struct trace_iterator *iter) +{ + char *tmp; + + /* + * iter->tr is NULL when used with tp_printk, which makes + * this get called where it is not safe to call krealloc(). + */ + if (!iter->tr || iter->fmt == static_fmt_buf) + return NULL; + + tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, + GFP_KERNEL); + if (tmp) { + iter->fmt_size += STATIC_FMT_BUF_SIZE; + iter->fmt = tmp; + } + + return tmp; +} + +/* Returns true if the string is safe to dereference from an event */ +static bool trace_safe_str(struct trace_iterator *iter, const char *str) +{ + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) + return true; + + /* OK if part of the temp seq buffer */ + if ((addr >= (unsigned long)iter->tmp_seq.buffer) && + (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) + return true; + + /* Core rodata can not be freed */ + if (is_kernel_rodata(addr)) + return true; + + if (trace_is_tracepoint_string(str)) + return true; + + /* + * Now this could be a module event, referencing core module + * data, which is OK. + */ + if (!iter->ent) + return false; + + trace_event = ftrace_find_event(iter->ent->type); + if (!trace_event) + return false; + + event = container_of(trace_event, struct trace_event_call, event); + if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) + return false; + + /* Would rather have rodata, but this will suffice */ + if (within_module_core(addr, event->module)) + return true; + + return false; +} + +/** + * ignore_event - Check dereferenced fields while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed + * + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. + */ +bool ignore_event(struct trace_iterator *iter) +{ + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; + + trace_event = ftrace_find_event(iter->ent->type); + + seq = &iter->seq; + + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; + } + + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; + + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } + + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; + + list_for_each_entry(field, head, link) { + const char *str; + bool good; + + if (!field->needs_test) + continue; + + str = *(const char **)(ptr + field->offset); + + good = trace_safe_str(iter, str); + + /* + * If you hit this warning, it is likely that the + * trace event in question used %s on a string that + * was saved at the time of the event, but may not be + * around when the trace is read. Use __string(), + * __assign_str() and __get_str() helpers in the TRACE_EVENT() + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; + } + } + return false; +} + +const char *trace_event_format(struct trace_iterator *iter, const char *fmt) +{ + const char *p, *new_fmt; + char *q; + + if (WARN_ON_ONCE(!fmt)) + return fmt; + + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER(HASH_PTR)) + return fmt; + + p = fmt; + new_fmt = q = iter->fmt; + while (*p) { + if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { + if (!trace_iter_expand_format(iter)) + return fmt; + + q += iter->fmt - new_fmt; + new_fmt = iter->fmt; + } + + *q++ = *p++; + + /* Replace %p with %px */ + if (p[-1] == '%') { + if (p[0] == '%') { + *q++ = *p++; + } else if (p[0] == 'p' && !isalnum(p[1])) { + *q++ = *p++; + *q++ = 'x'; + } + } + } + *q = '\0'; + + return new_fmt; +} + +#define STATIC_TEMP_BUF_SIZE 128 +static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); + /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* __find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; + struct trace_entry *entry; + + /* + * If called from ftrace_dump(), then the iter->temp buffer + * will be the static_temp_buf and not created from kmalloc. + * If the entry size is greater than the buffer, we can + * not save it. Just return NULL in that case. This is only + * used to add markers when two consecutive events' time + * stamps have a large delta. See trace_print_lat_context() + */ + if (iter->temp == static_temp_buf && + STATIC_TEMP_BUF_SIZE < ent_size) + return NULL; + + /* + * The __find_next_entry() may call peek_next_entry(), which may + * call ring_buffer_peek() that may make the contents of iter->ent + * undefined. Need to copy iter->ent now. + */ + if (iter->ent && iter->ent != iter->temp) { + if ((!iter->temp || iter->temp_size < iter->ent_size) && + !WARN_ON_ONCE(iter->temp == static_temp_buf)) { + void *temp; + temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!temp) + return NULL; + kfree(iter->temp); + iter->temp = temp; + iter->temp_size = iter->ent_size; + } + memcpy(iter->temp, iter->ent, iter->ent_size); + iter->ent = iter->temp; + } + entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* Put back the original ent_size */ + iter->ent_size = ent_size; + + return entry; } /* Find the next real entry, and increment the iterator to the next entry */ @@ -2228,7 +3894,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } @@ -2261,12 +3927,11 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; - per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; + per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -2279,14 +3944,16 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * that a reset never took place on a cpu. This is evident * by the timestamp being before the start of the buffer. */ - while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->trace_buffer->time_start) + while (ring_buffer_iter_peek(buf_iter, &ts)) { + if (ts >= iter->array_buffer->time_start) break; entries++; - ring_buffer_read(buf_iter, NULL); + ring_buffer_iter_advance(buf_iter); + /* This could be a big loop */ + cond_resched(); } - per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; + per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; } /* @@ -2302,15 +3969,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t l = 0; int cpu; - /* - * copy the tracer to avoid using a global lock all around. - * iter->trace is a copy of current_trace, the pointer to the - * name may be used instead of a strcmp(), as iter->trace->name - * will point to the same string as current_trace->name. - */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; + if (unlikely(tr->current_trace != iter->trace)) { + /* Close iter->trace before switching to the new current tracer */ + if (iter->trace->close) + iter->trace->close(iter); + iter->trace = tr->current_trace; + /* Reopen the new current tracer */ + if (iter->trace->open) + iter->trace->open(iter); + } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE @@ -2318,9 +3986,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EBUSY); #endif - if (!iter->snapshot) - atomic_inc(&trace_record_cmdline_disabled); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -2363,54 +4028,87 @@ static void s_stop(struct seq_file *m, void *p) return; #endif - if (!iter->snapshot) - atomic_dec(&trace_record_cmdline_disabled); - trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void -get_total_entries(struct trace_buffer *buf, - unsigned long *total, unsigned long *entries) +get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) { unsigned long count; + + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + +static void +get_total_entries(struct array_buffer *buf, + unsigned long *total, unsigned long *entries) +{ + unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(buf->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(buf->buffer, cpu); - *entries += count; + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; } } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->array_buffer, &total, &entries); + + return entries; +} + static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off/BH-disabled\n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / _-=> migrate-disable \n" + "# ||||| / delay \n" + "# cmd pid |||||| time | caller \n" + "# \\ / |||||| \\ | / \n"); } -static void print_event_info(struct trace_buffer *buf, struct seq_file *m) +static void print_event_info(struct array_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; @@ -2421,42 +4119,51 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, + unsigned int flags) { + bool tgid = flags & TRACE_ITER(RECORD_TGID); + print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } -static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, + unsigned int flags) { + bool tgid = flags & TRACE_ITER(RECORD_TGID); + static const char space[] = " "; + int prec = tgid ? 12 : 2; + print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + + seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); } void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_buffer *buf = iter->trace_buffer; + unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); + struct array_buffer *buf = iter->array_buffer; struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; - const char *name = "preemption"; - - name = type->name; + const char *name = type->name; get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); + name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" @@ -2465,15 +4172,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#else - "unknown", -#endif + preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -2505,20 +4204,23 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; + struct trace_array *tr = iter->tr; - if (!(trace_flags & TRACE_ITER_ANNOTATE)) + if (!(tr->trace_flags & TRACE_ITER(ANNOTATE))) return; if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) return; - if (cpumask_test_cpu(iter->cpu, iter->started)) + if (cpumask_available(iter->started) && + cpumask_test_cpu(iter->cpu, iter->started)) return; - if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) + if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries) return; - cpumask_set_cpu(iter->cpu, iter->started); + if (cpumask_available(iter->started)) + cpumask_set_cpu(iter->cpu, iter->started); /* Don't print started cpu buffer for the first entry of the trace */ if (iter->idx > 1) @@ -2526,10 +4228,27 @@ static void test_cpu_buff_start(struct trace_iterator *iter) iter->cpu); } +#ifdef CONFIG_FTRACE_SYSCALLS +static bool is_syscall_event(struct trace_event *event) +{ + return (event->funcs == &enter_syscall_print_funcs) || + (event->funcs == &exit_syscall_print_funcs); + +} +#define syscall_buf_size CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT +#else +static inline bool is_syscall_event(struct trace_event *event) +{ + return false; +} +#define syscall_buf_size 0 +#endif /* CONFIG_FTRACE_SYSCALLS */ + static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; @@ -2539,55 +4258,66 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - if (!trace_print_lat_context(iter)) - goto partial; - } else { - if (!trace_print_context(iter)) - goto partial; - } + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + trace_print_lat_context(iter); + else + trace_print_context(iter); } - if (event) + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (event) { + if (tr->trace_flags & TRACE_ITER(FIELDS)) + return print_event_fields(iter, event); + /* + * For TRACE_EVENT() events, the print_fmt is not + * safe to use if the array has delta offsets + * Force printing via the fields. + */ + if ((tr->text_delta)) { + /* ftrace and system call events are still OK */ + if ((event->type > __TRACE_LAST_TYPE) && + !is_syscall_event(event)) + return print_event_fields(iter, event); + } return event->funcs->trace(iter, sym_flags, event); + } - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; + trace_seq_printf(s, "Unknown type %d\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts)) - goto partial; - } + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) + trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); - if (!trace_seq_printf(s, "%d ?\n", entry->type)) - goto partial; + trace_seq_printf(s, "%d ?\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; @@ -2595,10 +4325,12 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { + SEQ_PUT_HEX_FIELD(s, entry->pid); + SEQ_PUT_HEX_FIELD(s, iter->cpu); + SEQ_PUT_HEX_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2608,23 +4340,26 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return ret; } - SEQ_PUT_FIELD_RET(s, newline); + SEQ_PUT_FIELD(s, newline); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); + if (tr->trace_flags & TRACE_ITER(CONTEXT_INFO)) { + SEQ_PUT_FIELD(s, entry->pid); + SEQ_PUT_FIELD(s, iter->cpu); + SEQ_PUT_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2645,7 +4380,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } return 1; @@ -2657,7 +4392,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) return 0; } } @@ -2668,12 +4403,20 @@ int trace_empty(struct trace_iterator *iter) /* Called with trace_event_read_lock() held. */ enum print_line_t print_trace_line(struct trace_iterator *iter) { + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; enum print_line_t ret; - if (iter->lost_events && - !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events)) - return TRACE_TYPE_PARTIAL_LINE; + if (iter->lost_events) { + if (iter->lost_events == (unsigned long)-1) + trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", + iter->cpu); + else + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -2682,27 +4425,27 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) } if (iter->ent->type == TRACE_BPUTS && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bputs_msg_only(iter); if (iter->ent->type == TRACE_BPRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && - trace_flags & TRACE_ITER_PRINTK && - trace_flags & TRACE_ITER_PRINTK_MSGONLY) + trace_flags & TRACE_ITER(PRINTK) && + trace_flags & TRACE_ITER(PRINTK_MSGONLY)) return trace_print_printk_msg_only(iter); - if (trace_flags & TRACE_ITER_BIN) + if (trace_flags & TRACE_ITER(BIN)) return print_bin_fmt(iter); - if (trace_flags & TRACE_ITER_HEX) + if (trace_flags & TRACE_ITER(HEX)) return print_hex_fmt(iter); - if (trace_flags & TRACE_ITER_RAW) + if (trace_flags & TRACE_ITER(RAW)) return print_raw_fmt(iter); return print_trace_fmt(iter); @@ -2711,6 +4454,7 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) void trace_latency_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; /* print nothing if the buffers are empty */ if (trace_empty(iter)) @@ -2719,15 +4463,17 @@ void trace_latency_header(struct seq_file *m) if (iter->iter_flags & TRACE_FILE_LAT_FMT) print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(tr->trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; - if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + if (!(trace_flags & TRACE_ITER(CONTEXT_INFO))) return; if (iter->iter_flags & TRACE_FILE_LAT_FMT) { @@ -2735,14 +4481,16 @@ void trace_default_header(struct seq_file *m) if (trace_empty(iter)) return; print_trace_header(m, iter); - if (!(trace_flags & TRACE_ITER_VERBOSE)) + if (!(trace_flags & TRACE_ITER(VERBOSE))) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) { - if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + if (!(trace_flags & TRACE_ITER(VERBOSE))) { + if (trace_flags & TRACE_ITER(IRQ_INFO)) + print_func_help_header_irq(iter->array_buffer, + m, trace_flags); else - print_func_help_header(iter->trace_buffer, m); + print_func_help_header(iter->array_buffer, m, + trace_flags); } } } @@ -2751,44 +4499,44 @@ static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; - seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); + seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" + "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); #else - seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); - seq_printf(m, "# Must use main snapshot file to allocate.\n"); + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); #endif - seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) - seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else - seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - seq_printf(m, "# Snapshot commands:\n"); + seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else @@ -2828,7 +4576,11 @@ static int s_show(struct seq_file *m, void *v) iter->leftover = ret; } else { - print_trace_line(iter); + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will @@ -2843,6 +4595,17 @@ static int s_show(struct seq_file *m, void *v) return 0; } +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} + static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -2850,10 +4613,29 @@ static const struct seq_operations tracer_seq_ops = { .show = s_show, }; +/* + * Note, as iter itself can be allocated and freed in different + * ways, this function is only used to free its content, and not + * the iterator itself. The only requirement to all the allocations + * is that it must zero all fields (kzalloc), as freeing works with + * ethier allocated content or NULL. + */ +static void free_trace_iter_content(struct trace_iterator *iter) +{ + /* The fmt is either NULL, allocated or points to static_fmt_buf */ + if (iter->fmt != static_fmt_buf) + kfree(iter->fmt); + + kfree(iter->temp); + kfree(iter->buffer_iter); + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); +} + static struct trace_iterator * -__tracing_open(struct trace_array *tr, struct trace_cpu *tc, - struct inode *inode, struct file *file, bool snapshot) +__tracing_open(struct inode *inode, struct file *file, bool snapshot) { + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; int cpu; @@ -2864,21 +4646,35 @@ __tracing_open(struct trace_array *tr, struct trace_cpu *tc, if (!iter) return ERR_PTR(-ENOMEM); - iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), GFP_KERNEL); if (!iter->buffer_iter) goto release; /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. + * trace_find_next_entry() may need to save off iter->ent. + * It will place it into the iter->temp buffer. As most + * events are less than 128, allocate a buffer of that size. + * If one is greater, then trace_find_next_entry() will + * allocate a new buffer to adjust for the bigger iter->ent. + * It's not critical if it fails to get allocated here. */ - mutex_lock(&trace_types_lock); - iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) - goto fail; + iter->temp = kmalloc(128, GFP_KERNEL); + if (iter->temp) + iter->temp_size = 128; + + /* + * trace_event_printf() may need to modify given format + * string to replace %p with %px so that it shows real address + * instead of hash value. However, that is only for the event + * tracing, other tracer may not need. Defer the allocation + * until it is needed. + */ + iter->fmt = NULL; + iter->fmt_size = 0; - *iter->trace = *tr->current_trace; + mutex_lock(&trace_types_lock); + iter->trace = tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; @@ -2888,47 +4684,48 @@ __tracing_open(struct trace_array *tr, struct trace_cpu *tc, #ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) - iter->trace_buffer = &tr->max_buffer; + iter->array_buffer = &tr->max_buffer; else #endif - iter->trace_buffer = &tr->trace_buffer; + iter->array_buffer = &tr->array_buffer; iter->snapshot = snapshot; iter->pos = -1; + iter->cpu_file = tracing_get_cpu(inode); mutex_init(&iter->mutex); - iter->cpu_file = tc->cpu; /* Notify the tracer early; before we stop tracing. */ - if (iter->trace && iter->trace->open) + if (iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->trace_buffer->buffer)) + if (ring_buffer_overruns(iter->array_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - /* stop the trace while dumping if we are not opening "snapshot" */ - if (!iter->snapshot) + /* + * If pause-on-trace is enabled, then stop the trace while + * dumping, unless this is the "snapshot" file + */ + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) { + iter->iter_flags |= TRACE_FILE_PAUSE; tracing_stop_tr(tr); + } if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } @@ -2938,8 +4735,7 @@ __tracing_open(struct trace_array *tr, struct trace_cpu *tc, fail: mutex_unlock(&trace_types_lock); - kfree(iter->trace); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); @@ -2947,69 +4743,97 @@ release: int tracing_open_generic(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; filp->private_data = inode->i_private; return 0; } +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ -static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; + int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; filp->private_data = inode->i_private; return 0; - } -static int tracing_open_generic_tc(struct inode *inode, struct file *filp) +/* + * The private pointer of the inode is the trace_event_file. + * Update the tr ref count associated to it. + */ +int tracing_open_file_tr(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_event_file *file = inode->i_private; + int ret; - if (tracing_disabled) - return -ENODEV; + ret = tracing_check_open_get_tr(file->tr); + if (ret) + return ret; + + guard(mutex)(&event_mutex); - if (trace_array_get(tr) < 0) + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); return -ENODEV; + } else { + event_file_get(file); + } filp->private_data = inode->i_private; return 0; - +} + +int tracing_release_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + + trace_array_put(file->tr); + event_file_put(file); + + return 0; +} + +int tracing_single_release_file_tr(struct inode *inode, struct file *filp) +{ + tracing_release_file_tr(inode, filp); + return single_release(inode, filp); } static int tracing_release(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; struct trace_iterator *iter; - struct trace_array *tr; int cpu; - /* Writes do not use seq_file, need to grab tr from inode */ if (!(file->f_mode & FMODE_READ)) { - struct trace_cpu *tc = inode->i_private; - - trace_array_put(tc->tr); + trace_array_put(tr); return 0; } + /* Writes do not use seq_file */ iter = m->private; - tr = iter->tr; - trace_array_put(tr); - mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { @@ -3020,21 +4844,21 @@ static int tracing_release(struct inode *inode, struct file *file) if (iter->trace && iter->trace->close) iter->trace->close(iter); - if (!iter->snapshot) + if (iter->iter_flags & TRACE_FILE_PAUSE) /* reenable tracing if it was previously enabled */ tracing_start_tr(tr); + + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); - mutex_destroy(&iter->mutex); - free_cpumask_var(iter->started); - kfree(iter->trace); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); seq_release_private(inode, file); return 0; } -static int tracing_release_generic_tr(struct inode *inode, struct file *file) +int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -3042,15 +4866,6 @@ static int tracing_release_generic_tr(struct inode *inode, struct file *file) return 0; } -static int tracing_release_generic_tc(struct inode *inode, struct file *file) -{ - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; - - trace_array_put(tr); - return 0; -} - static int tracing_single_release_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -3062,28 +4877,35 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file) static int tracing_open(struct inode *inode, struct file *file) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was open for write, then erase contents */ - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) { - if (tc->cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + int cpu = tracing_get_cpu(inode); + struct array_buffer *trace_buf = &tr->array_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif + + if (cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, tc->cpu); + tracing_reset_cpu(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { - iter = __tracing_open(tr, tc, inode, file, false); + iter = __tracing_open(inode, file, false); if (IS_ERR(iter)) ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) + else if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; } @@ -3093,27 +4915,57 @@ static int tracing_open(struct inode *inode, struct file *file) return ret; } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. + */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ +#ifdef CONFIG_TRACER_SNAPSHOT + /* arrays with mapped buffer range do not have snapshots */ + if (tr->range_addr_start && t->use_max_tr) + return false; +#endif + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) - t = t->next; + t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; return t; } @@ -3130,7 +4982,7 @@ static int t_show(struct seq_file *m, void *v) if (!t) return 0; - seq_printf(m, "%s", t->name); + seq_puts(m, t->name); if (t->next) seq_putc(m, ' '); else @@ -3148,10 +5000,32 @@ static const struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { - if (tracing_disabled) - return -ENODEV; + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; - return seq_open(file, &show_traces_seq_ops); + ret = seq_open(file, &show_traces_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; +} + +static int tracing_seq_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return seq_release(inode, file); } static ssize_t @@ -3161,123 +5035,132 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { + int ret; + if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else - return 0; + file->f_pos = ret = 0; + + return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, + .read_iter = seq_read_iter, + .splice_read = copy_splice_read, .write = tracing_write_stub, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_release, }; static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, - .release = seq_release, .llseek = seq_lseek, + .release = tracing_seq_release, }; -/* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_var_t tracing_cpumask; - -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { + struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&tracing_cpumask_update_lock); - - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); - if (count - len < 2) { - count = -EINVAL; - goto out_err; - } - len += sprintf(mask_str + len, "\n"); - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; -out_err: - mutex_unlock(&tracing_cpumask_update_lock); + len = snprintf(mask_str, len, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)); + if (len >= count) + return -EINVAL; - return count; + return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } -static ssize_t -tracing_cpumask_write(struct file *filp, const char __user *ubuf, - size_t count, loff_t *ppos) +int tracing_set_cpumask(struct trace_array *tr, + cpumask_var_t tracing_cpumask_new) { - struct trace_array *tr = filp->private_data; - cpumask_var_t tracing_cpumask_new; - int err, cpu; - - if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) - return -ENOMEM; - - err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); - if (err) - goto err_unlock; + int cpu; - mutex_lock(&tracing_cpumask_update_lock); + if (!tr) + return -EINVAL; local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpumask_test_cpu(cpu, tracing_cpumask) && + if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); - ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); + ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#endif } - if (!cpumask_test_cpu(cpu, tracing_cpumask) && + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); - ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); + ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#endif } } - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); local_irq_enable(); - cpumask_copy(tracing_cpumask, tracing_cpumask_new); + cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); + + return 0; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_array *tr = file_inode(filp)->i_private; + cpumask_var_t tracing_cpumask_new; + int err; + + if (count == 0 || count > KMALLOC_MAX_SIZE) + return -EINVAL; + + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_free; + + err = tracing_set_cpumask(tr, tracing_cpumask_new); + if (err) + goto err_free; - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; -err_unlock: +err_free: free_cpumask_var(tracing_cpumask_new); return err; } static const struct file_operations tracing_cpumask_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; @@ -3285,38 +5168,45 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; struct trace_array *tr = m->private; + struct tracer_flags *flags; u32 tracer_flags; int i; - mutex_lock(&trace_types_lock); - tracer_flags = tr->current_trace->flags->val; - trace_opts = tr->current_trace->flags->opts; + guard(mutex)(&trace_types_lock); for (i = 0; trace_options[i]; i++) { - if (trace_flags & (1 << i)) + if (tr->trace_flags & (1ULL << i)) seq_printf(m, "%s\n", trace_options[i]); else seq_printf(m, "no%s\n", trace_options[i]); } + flags = tr->current_trace_flags; + if (!flags || !flags->opts) + return 0; + + tracer_flags = flags->val; + trace_opts = flags->opts; + for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) seq_printf(m, "%s\n", trace_opts[i].name); else seq_printf(m, "no%s\n", trace_opts[i].name); } - mutex_unlock(&trace_types_lock); return 0; } -static int __set_tracer_option(struct tracer *trace, +static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { - int ret; + struct tracer *trace = tracer_flags->trace; + int ret = 0; - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + if (trace->set_flag) + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -3328,96 +5218,188 @@ static int __set_tracer_option(struct tracer *trace, } /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { - struct tracer_flags *tracer_flags = trace->flags; + struct tracer_flags *tracer_flags = tr->current_trace_flags; struct tracer_opt *opts = NULL; int i; + if (!tracer_flags || !tracer_flags->opts) + return 0; + for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); + return __set_tracer_option(tr, tracer_flags, opts, neg); } return -EINVAL; } /* Some tracers require overwrite to stay enabled */ -int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set) { - if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + if (tracer->enabled && (mask & TRACE_ITER(OVERWRITE)) && !set) return -1; return 0; } -int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) { + switch (mask) { + case TRACE_ITER(RECORD_TGID): + case TRACE_ITER(RECORD_CMD): + case TRACE_ITER(TRACE_PRINTK): + case TRACE_ITER(COPY_MARKER): + lockdep_assert_held(&event_mutex); + } + /* do nothing if flag is already set */ - if (!!(trace_flags & mask) == !!enabled) + if (!!(tr->trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) - if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; + switch (mask) { + case TRACE_ITER(TRACE_PRINTK): + if (enabled) { + update_printk_trace(tr); + } else { + /* + * The global_trace cannot clear this. + * It's flag only gets cleared if another instance sets it. + */ + if (printk_trace == &global_trace) + return -EINVAL; + /* + * An instance must always have it set. + * by default, that's the global_trace instance. + */ + if (printk_trace == tr) + update_printk_trace(&global_trace); + } + break; + + case TRACE_ITER(COPY_MARKER): + update_marker_trace(tr, enabled); + /* update_marker_trace updates the tr->trace_flags */ + return 0; + } + if (enabled) - trace_flags |= mask; + tr->trace_flags |= mask; else - trace_flags &= ~mask; + tr->trace_flags &= ~mask; - if (mask == TRACE_ITER_RECORD_CMD) + switch (mask) { + case TRACE_ITER(RECORD_CMD): trace_event_enable_cmd_record(enabled); + break; + + case TRACE_ITER(RECORD_TGID): + + if (trace_alloc_tgid_map() < 0) { + tr->trace_flags &= ~TRACE_ITER(RECORD_TGID); + return -ENOMEM; + } + + trace_event_enable_tgid_record(enabled); + break; + + case TRACE_ITER(EVENT_FORK): + trace_event_follow_fork(tr, enabled); + break; - if (mask == TRACE_ITER_OVERWRITE) { - ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); + case TRACE_ITER(FUNC_FORK): + ftrace_pid_follow_fork(tr, enabled); + break; + + case TRACE_ITER(OVERWRITE): + ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif - } + break; - if (mask == TRACE_ITER_PRINTK) + case TRACE_ITER(PRINTK): trace_printk_start_stop_comm(enabled); + trace_printk_control(enabled); + break; + +#if defined(CONFIG_FUNCTION_PROFILER) && defined(CONFIG_FUNCTION_GRAPH_TRACER) + case TRACE_GRAPH_GRAPH_TIME: + ftrace_graph_graph_time_control(enabled); + break; +#endif + } return 0; } -static int trace_set_options(struct trace_array *tr, char *option) +int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; - int ret = -ENODEV; - int i; + int ret; + size_t orig_len = strlen(option); + int len; cmp = strstrip(option); - if (strncmp(cmp, "no", 2) == 0) { + len = str_has_prefix(cmp, "no"); + if (len) neg = 1; - cmp += 2; - } - mutex_lock(&trace_types_lock); + cmp += len; - for (i = 0; trace_options[i]; i++) { - if (strcmp(cmp, trace_options[i]) == 0) { - ret = set_tracer_flag(tr, 1 << i, !neg); - break; - } - } + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + ret = match_string(trace_options, -1, cmp); /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) - ret = set_tracer_option(tr->current_trace, cmp, neg); + if (ret < 0) + ret = set_tracer_option(tr, cmp, neg); + else + ret = set_tracer_flag(tr, 1ULL << ret, !neg); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + /* + * If the first trailing whitespace is replaced with '\0' by strstrip, + * turn it back into a space. + */ + if (orig_len > strlen(option)) + option[strlen(option)] = ' '; return ret; } +static void __init apply_trace_boot_options(void) +{ + char *buf = trace_boot_options_buf; + char *option; + + while (true) { + option = strsep(&buf, ","); + + if (!option) + break; + + if (*option) + trace_set_options(&global_trace, option); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3430,7 +5412,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -3447,14 +5429,17 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, static int tracing_trace_options_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; + int ret; - if (tracing_disabled) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); - return single_open(file, tracing_trace_options_show, inode->i_private); + return ret; } static const struct file_operations tracing_iter_fops = { @@ -3467,6 +5452,10 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" + "By default tracefs removes all OTH file permission bits.\n" + "When mounting tracefs an optional group id can be specified\n" + "which adds the group to every directory and file in tracefs:\n\n" + "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" @@ -3475,9 +5464,10 @@ static const char readme_msg[] = " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" - " trace_clock\t\t-change the clock used to order events\n" + " trace_clock\t\t- change the clock used to order events\n" " local: Per cpu clock but may not be synced across CPUs\n" " global: Synced across CPUs but slows tracing down.\n" " counter: Not a clock, but just an increment\n" @@ -3486,65 +5476,276 @@ static const char readme_msg[] = #ifdef CONFIG_X86_64 " x86-tsc: TSC cycle counter\n" #endif + "\n timestamp_mode\t- view the mode used to timestamp events\n" + " delta: Delta difference against a buffer-wide timestamp\n" + " absolute: Absolute (standalone) timestamp\n" "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" + "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" - " set_ftrace_filter\t- echo function name in here to only trace these functions\n" - " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" - " modules: Can select a group via module\n" - " Format: :mod:<module-name>\n" - " example: echo :mod:ext3 > set_ftrace_filter\n" - " triggers: a command to perform when function is hit\n" - " Format: <function>:<trigger>[:count]\n" - " trigger: traceon, traceoff\n" - " enable_event:<system>:<event>\n" - " disable_event:<system>:<event>\n" + " set_ftrace_filter\t- echo function name in here to only trace these\n" + "\t\t\t functions\n" + "\t accepts: func_full_name or glob-matching-pattern\n" + "\t modules: Can select a group via module\n" + "\t Format: :mod:<module-name>\n" + "\t example: echo :mod:ext3 > set_ftrace_filter\n" + "\t triggers: a command to perform when function is hit\n" + "\t Format: <function>:<trigger>[:count]\n" + "\t trigger: traceon, traceoff\n" + "\t\t enable_event:<system>:<event>\n" + "\t\t disable_event:<system>:<event>\n" #ifdef CONFIG_STACKTRACE - " stacktrace\n" + "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT - " snapshot\n" + "\t\t snapshot\n" #endif - " example: echo do_fault:traceoff > set_ftrace_filter\n" - " echo do_trap:traceoff:3 > set_ftrace_filter\n" - " The first one will disable tracing every time do_fault is hit\n" - " The second will disable tracing at most 3 times when do_trap is hit\n" - " The first time do trap is hit and it disables tracing, the counter\n" - " will decrement to 2. If tracing is already disabled, the counter\n" - " will not decrement. It only decrements when the trigger did work\n" - " To remove trigger without count:\n" - " echo '!<function>:<trigger> > set_ftrace_filter\n" - " To remove trigger with a count:\n" - " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + "\t\t dump\n" + "\t\t cpudump\n" + "\t example: echo do_fault:traceoff > set_ftrace_filter\n" + "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" + "\t The first one will disable tracing every time do_fault is hit\n" + "\t The second will disable tracing at most 3 times when do_trap is hit\n" + "\t The first time do trap is hit and it disables tracing, the\n" + "\t counter will decrement to 2. If tracing is already disabled,\n" + "\t the counter will not decrement. It only decrements when the\n" + "\t trigger did work\n" + "\t To remove trigger without count:\n" + "\t echo '!<function>:<trigger> > set_ftrace_filter\n" + "\t To remove trigger with a count:\n" + "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" " set_ftrace_notrace\t- echo function name in here to never trace.\n" - " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" - " modules: Can select a group via module command :mod:\n" - " Does not accept triggers\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module command :mod:\n" + "\t Does not accept triggers\n" #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_TRACER - " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" + " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" + "\t\t (function)\n" + " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" + "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT - "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" - "\t\t\t Read the contents for more information\n" + "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" + "\t\t\t snapshot buffer. Read the contents for more\n" + "\t\t\t information\n" #endif -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" - "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" + "\t\t\t Write into this file to reset the max size (trigger a\n" + "\t\t\t new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE - " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" + "\t\t\t traces\n" +#endif +#endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_DYNAMIC_EVENTS + " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_KPROBE_EVENTS + " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_UPROBE_EVENTS + " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ + defined(CONFIG_FPROBE_EVENTS) + "\t accepts: event-definitions (one definition per line)\n" +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) + "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" + "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" +#endif +#ifdef CONFIG_FPROBE_EVENTS + "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n" + "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t s:[synthetic/]<event> <field> [<field>]\n" +#endif + "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n" + "\t -:[<group>/][<event>]\n" +#ifdef CONFIG_KPROBE_EVENTS + "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" + "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" +#endif +#ifdef CONFIG_UPROBE_EVENTS + " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" +#endif + "\t args: <name>=fetcharg[:type]\n" + "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + "\t <argname>[->field[->field|.field...]],\n" +#endif +#else + "\t $stack<index>, $stack, $retval, $comm,\n" +#endif + "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" + "\t kernel return probes support: $retval, $arg<N>, $comm\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" + "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" + "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t field: <stype> <name>;\n" + "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" + "\t [unsigned] char/int/long\n" +#endif + "\t efield: For event probes ('e' types), the field is on of the fields\n" + "\t of the <attached-group>/<attached-event>.\n" +#endif + " set_event\t\t- Enables events by name written into it\n" + "\t\t\t Can enable module events via: :mod:<module>\n" + " events/\t\t- Directory containing all trace event subsystems:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" + " events/<system>/\t- Directory containing all trace events for <system>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" + "\t\t\t events\n" + " filter\t\t- If set, only events passing filter are traced\n" + " events/<system>/<event>/\t- Directory containing control files for\n" + "\t\t\t <event>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" + " filter\t\t- If set, only events passing filter are traced\n" + " trigger\t\t- If set, a command to perform when event is hit\n" + "\t Format: <trigger>[:count][if <filter>]\n" + "\t trigger: traceon, traceoff\n" + "\t enable_event:<system>:<event>\n" + "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist:<system>:<event>\n" + "\t disable_hist:<system>:<event>\n" +#endif +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" +#endif + "\t example: echo traceoff > events/block/block_unplug/trigger\n" + "\t echo traceoff:3 > events/block/block_unplug/trigger\n" + "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" + "\t events/block/block_unplug/trigger\n" + "\t The first disables tracing every time block_unplug is hit.\n" + "\t The second disables tracing the first 3 times block_unplug is hit.\n" + "\t The third enables the kmalloc event the first 3 times block_unplug\n" + "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" + "\t Like function triggers, the counter is only decremented if it\n" + "\t enabled or disabled tracing.\n" + "\t To remove a trigger without a count:\n" + "\t echo '!<trigger> > <system>/<event>/trigger\n" + "\t To remove a trigger with a count:\n" + "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" + "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=<field1[,field2,...]>\n" + "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n" + "\t [:values=<field1[,field2,...]>]\n" + "\t [:sort=<field1[,field2,...]>]\n" + "\t [:size=#entries]\n" + "\t [:pause][:continue][:clear]\n" + "\t [:name=histname1]\n" + "\t [:nohitcount]\n" + "\t [:<handler>.<action>]\n" + "\t [if <filter>]\n\n" + "\t Note, special fields can be used as well:\n" + "\t common_timestamp - to record current timestamp\n" + "\t common_cpu - to record the CPU the event happened on\n" + "\n" + "\t A hist trigger variable can be:\n" + "\t - a reference to a field e.g. x=current_timestamp,\n" + "\t - a reference to another variable e.g. y=$x,\n" + "\t - a numeric literal: e.g. ms_per_sec=1000,\n" + "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" + "\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" + "\t multiplication(*) and division(/) operators. An operand can be either a\n" + "\t variable reference, field or numeric literal.\n" + "\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. Keys and values\n" + "\t correspond to fields in the event's format description. Keys\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" + "\t Compound keys consisting of up to two fields can be specified\n" + "\t by the 'keys' keyword. Values must correspond to numeric\n" + "\t fields. Sort keys consisting of up to two fields can be\n" + "\t specified using the 'sort' keyword. The sort direction can\n" + "\t be modified by appending '.descending' or '.ascending' to a\n" + "\t sort field. The 'size' parameter can be used to specify more\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n" + "\t If a hist trigger is given a name using the 'name' parameter,\n" + "\t its histogram data will be shared with other triggers of the\n" + "\t same name, and trigger hits will update this common data.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout. If there are multiple hist\n" + "\t triggers attached to an event, there will be a table for each\n" + "\t trigger in the output. The table displayed for a named\n" + "\t trigger will be the same as any other instance having the\n" + "\t same name. The default format used to display a given field\n" + "\t can be modified by appending any of the following modifiers\n" + "\t to the field name, as applicable:\n\n" + "\t .hex display a number as a hex value\n" + "\t .sym display an address as a symbol\n" + "\t .sym-offset display an address as a symbol and offset\n" + "\t .execname display a common_pid as a program name\n" + "\t .syscall display a syscall id as a syscall name\n" + "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 'continue' can be used to start or\n" + "\t restart a paused hist trigger.\n\n" + "\t The 'clear' parameter will clear the contents of a running\n" + "\t hist trigger and leave its current paused/active state\n" + "\t unchanged.\n\n" + "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" + "\t raw hitcount in the histogram.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. The syntax is analogous to\n" + "\t the enable_event and disable_event triggers.\n\n" + "\t Hist trigger handlers and actions are executed whenever a\n" + "\t a histogram entry is added or updated. They take the form:\n\n" + "\t <handler>.<action>\n\n" + "\t The available handlers are:\n\n" + "\t onmatch(matching.event) - invoke on addition or update\n" + "\t onmax(var) - invoke if var exceeds current max\n" + "\t onchange(var) - invoke action if var changes\n\n" + "\t The available actions are:\n\n" + "\t trace(<synthetic_event>,param list) - generate synthetic event\n" + "\t save(field,...) - save current event fields\n" +#ifdef CONFIG_TRACER_SNAPSHOT + "\t snapshot() - snapshot the trace buffer\n\n" +#endif +#ifdef CONFIG_SYNTH_EVENTS + " events/synthetic_events\t- Create/append/remove/show synthetic events\n" + "\t Write into this file to define/undefine new synthetic events.\n" + "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" +#endif #endif -#endif /* CONFIG_STACKTRACE */ ; static ssize_t @@ -3561,57 +5762,191 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +static union trace_eval_map_item * +update_eval_map(union trace_eval_map_item *ptr) { - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; + if (!ptr->map.eval_string) { + if (ptr->tail.next) { + ptr = ptr->tail.next; + /* Set ptr to the next real item (skip head) */ + ptr++; + } else + return NULL; + } + return ptr; +} - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; +static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) +{ + union trace_eval_map_item *ptr = v; - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); - return -ENOMEM; - } + /* + * Paranoid! If ptr points to end, we don't want to increment past it. + * This really should never happen. + */ + (*pos)++; + ptr = update_eval_map(ptr); + if (WARN_ON_ONCE(!ptr)) + return NULL; - buf = file_buf; + ptr++; + ptr = update_eval_map(ptr); - for (i = 0; i < SAVED_CMDLINES; i++) { - int r; + return ptr; +} - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; +static void *eval_map_start(struct seq_file *m, loff_t *pos) +{ + union trace_eval_map_item *v; + loff_t l = 0; + + mutex_lock(&trace_eval_mutex); + + v = trace_eval_maps; + if (v) + v++; - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; + while (v && l < *pos) { + v = eval_map_next(m, v, &l); } - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + return v; +} - kfree(file_buf); - kfree(buf_comm); +static void eval_map_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&trace_eval_mutex); +} - return len; +static int eval_map_show(struct seq_file *m, void *v) +{ + union trace_eval_map_item *ptr = v; + + seq_printf(m, "%s %ld (%s)\n", + ptr->map.eval_string, ptr->map.eval_value, + ptr->map.system); + + return 0; } -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, +static const struct seq_operations tracing_eval_map_seq_ops = { + .start = eval_map_start, + .next = eval_map_next, + .stop = eval_map_stop, + .show = eval_map_show, }; +static int tracing_eval_map_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_eval_map_seq_ops); +} + +static const struct file_operations tracing_eval_map_fops = { + .open = tracing_eval_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static inline union trace_eval_map_item * +trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) +{ + /* Return tail of array given the head */ + return ptr + ptr->head.length + 1; +} + +static void +trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, + int len) +{ + struct trace_eval_map **stop; + struct trace_eval_map **map; + union trace_eval_map_item *map_array; + union trace_eval_map_item *ptr; + + stop = start + len; + + /* + * The trace_eval_maps contains the map plus a head and tail item, + * where the head holds the module and length of array, and the + * tail holds a pointer to the next list. + */ + map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); + if (!map_array) { + pr_warn("Unable to allocate trace eval mapping\n"); + return; + } + + guard(mutex)(&trace_eval_mutex); + + if (!trace_eval_maps) + trace_eval_maps = map_array; + else { + ptr = trace_eval_maps; + for (;;) { + ptr = trace_eval_jmp_to_tail(ptr); + if (!ptr->tail.next) + break; + ptr = ptr->tail.next; + + } + ptr->tail.next = map_array; + } + map_array->head.mod = mod; + map_array->head.length = len; + map_array++; + + for (map = start; (unsigned long)map < (unsigned long)stop; map++) { + map_array->map = **map; + map_array++; + } + memset(map_array, 0, sizeof(*map_array)); +} + +static void trace_create_eval_file(struct dentry *d_tracer) +{ + trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, + NULL, &tracing_eval_map_fops); +} + +#else /* CONFIG_TRACE_EVAL_MAP_FILE */ +static inline void trace_create_eval_file(struct dentry *d_tracer) { } +static inline void trace_insert_eval_map_file(struct module *mod, + struct trace_eval_map **start, int len) { } +#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ + +static void +trace_event_update_with_eval_map(struct module *mod, + struct trace_eval_map **start, + int len) +{ + struct trace_eval_map **map; + + /* Always run sanitizer only if btf_type_tag attr exists. */ + if (len <= 0) { + if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && + IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && + __has_attribute(btf_type_tag))) + return; + } + + map = start; + + trace_event_update_all(map, len); + + if (len <= 0) + return; + + trace_insert_eval_map_file(mod, start, len); +} + static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3620,20 +5955,20 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, char buf[MAX_TRACER_SIZE+2]; int r; - mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", tr->current_trace->name); - mutex_unlock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + r = sprintf(buf, "%s\n", tr->current_trace->name); + } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); return t->init(tr); } -static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) +static void set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; @@ -3641,10 +5976,19 @@ static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) per_cpu_ptr(buf->data, cpu)->entries = val; } +static void update_buffer_entries(struct array_buffer *buf, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + } else { + per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); + } +} + #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, - struct trace_buffer *size_buf, int cpu_id) +static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id) { int cpu, ret = 0; @@ -3679,25 +6023,27 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = true; + trace_set_ring_buffer_expanded(tr); /* May be called before buffers are initialized */ - if (!tr->trace_buffer.buffer) + if (!tr->array_buffer.buffer) return 0; - ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); + /* Do not allow tracing while resizing ring buffer */ + tracing_stop_tr(tr); + + ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) - return ret; + goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || - !tr->current_trace->use_max_tr) + if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(&tr->trace_buffer, - &tr->trace_buffer, cpu); + int r = resize_buffer_duplicate_size(&tr->array_buffer, + &tr->array_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3716,53 +6062,191 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, WARN_ON(1); tracing_disabled = 1; } - return ret; + goto out_start; } - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->max_buffer, size); - else - per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->trace_buffer, size); - else - per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; - + update_buffer_entries(&tr->array_buffer, cpu); + out_start: + tracing_start_tr(tr); return ret; } -static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, - unsigned long size, int cpu_id) +ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) { - int ret = size; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ - if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { - ret = -EINVAL; - goto out; - } + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) + return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; + return __tracing_resize_ring_buffer(tr, size, cpu_id); +} -out: - mutex_unlock(&trace_types_lock); +struct trace_mod_entry { + unsigned long mod_addr; + char mod_name[MODULE_NAME_LEN]; +}; - return ret; +struct trace_scratch { + unsigned int clock_id; + unsigned long text_addr; + unsigned long nr_entries; + struct trace_mod_entry entries[]; +}; + +static DEFINE_MUTEX(scratch_mutex); + +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) + return 0; + else + return addr - ent->mod_addr; +} + +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. + */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned long raddr; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* if there is no tscrach, module_delta must be NULL. */ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { + raddr = addr + tr->text_delta; + return __is_kernel(raddr) || is_kernel_core_data(raddr) || + is_kernel_rodata(raddr) ? raddr : addr; + } + + /* Note that entries must be sorted. */ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + +#ifdef CONFIG_MODULES +static int save_mod(struct module *mod, void *data) +{ + struct trace_array *tr = data; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned int size; + + tscratch = tr->scratch; + if (!tscratch) + return -1; + size = tr->scratch_size; + + if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) + return -1; + + entry = &tscratch->entries[tscratch->nr_entries]; + + tscratch->nr_entries++; + + entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; + strscpy(entry->mod_name, mod->name); + + return 0; +} +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; } +#endif +static void update_last_data(struct trace_array *tr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + + if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) + return; + + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return; + + /* Only if the buffer has previous boot data clear and update it. */ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + + /* Reset the module list and reload them */ + if (tr->scratch) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = tr->clock_id; + memset(tscratch->entries, 0, + flex_array_size(tscratch, entries, tscratch->nr_entries)); + tscratch->nr_entries = 0; + + guard(mutex)(&scratch_mutex); + module_for_each_mod(save_mod, tr); + } + + /* + * Need to clear all CPU buffers as there cannot be events + * from the previous boot mixed with events with this boot + * as that will cause a confusing trace. Need to clear all + * CPU buffers, even for those that may currently be offline. + */ + tracing_reset_all_cpus(&tr->array_buffer); + + /* Using current data now */ + tr->text_delta = 0; + + if (!tr->scratch) + return; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); + + /* Set the persistent ring buffer meta data to this address */ + tscratch->text_addr = (unsigned long)_text; +} /** * tracing_update_buffers - used by tracing facility to expand ring buffers + * @tr: The tracing instance * * To save on memory when the tracing is never used on a system with it * configured in. The ring buffers are set to a minimum size. But once @@ -3771,72 +6255,114 @@ out: * * This function is to be called when a tracer is about to be used. */ -int tracing_update_buffers(void) +int tracing_update_buffers(struct trace_array *tr) { int ret = 0; - mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, - RING_BUFFER_ALL_CPUS); - mutex_unlock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + update_last_data(tr); + + if (!tr->ring_buffer_expanded) + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, + RING_BUFFER_ALL_CPUS); return ret; } -struct trace_option_dentry; +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; -static struct trace_option_dentry * -create_trace_option_files(struct trace_array *tr, struct tracer *tracer); + tr->current_trace->enabled--; -static void -destroy_trace_option_files(struct trace_option_dentry *topts); + if (tr->current_trace->reset) + tr->current_trace->reset(tr); -static int tracing_set_tracer(const char *buf) + tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; +} + +static bool tracer_options_updated; + +int tracing_set_tracer(struct trace_array *tr, const char *buf) { - static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; - struct tracer *t; + struct tracer *trace = NULL; + struct tracers *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif - int ret = 0; + int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + + update_last_data(tr); - if (!ring_buffer_expanded) { + if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) - goto out; + return ret; ret = 0; } - for (t = trace_types; t; t = t->next) { - if (strcmp(t->name, buf) == 0) + list_for_each_entry(t, &tr->tracers, list) { + if (strcmp(t->tracer->name, buf) == 0) { + trace = t->tracer; break; + } } - if (!t) { - ret = -EINVAL; - goto out; + if (!trace) + return -EINVAL; + + if (trace == tr->current_trace) + return 0; + +#ifdef CONFIG_TRACER_SNAPSHOT + if (trace->use_max_tr) { + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + ret = tr->cond_snapshot ? -EBUSY : 0; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + return ret; + } +#endif + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && trace->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + trace->name); + return -EINVAL; } - if (t == tr->current_trace) - goto out; + + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(trace, tr)) + return -EINVAL; + + /* If trace pipe files are being read, we can't change the tracer */ + if (tr->trace_ref) + return -EBUSY; trace_branch_disable(); - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); - /* Current trace needs to be nop_trace before synchronize_sched */ - tr->current_trace = &nop_trace; - #ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->allocated_snapshot; + had_max_tr = tr->current_trace->use_max_tr; + + /* Current trace needs to be nop_trace before synchronize_rcu */ + tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; - if (had_max_tr && !t->use_max_tr) { + if (had_max_tr && !trace->use_max_tr) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from @@ -3844,43 +6370,48 @@ static int tracing_set_tracer(const char *buf) * The update_max_tr is called from interrupts disabled * so a synchronized_sched() is sufficient. */ - synchronize_sched(); + synchronize_rcu(); free_snapshot(tr); + tracing_disarm_snapshot(tr); } + + if (!had_max_tr && trace->use_max_tr) { + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + } +#else + tr->current_trace = &nop_trace; #endif - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); + tr->current_trace_flags = t->flags ? : t->tracer->flags; + if (trace->init) { + ret = tracer_init(trace, tr); + if (ret) { #ifdef CONFIG_TRACER_MAX_TRACE - if (t->use_max_tr && !had_max_tr) { - ret = alloc_snapshot(tr); - if (ret < 0) - goto out; - } + if (trace->use_max_tr) + tracing_disarm_snapshot(tr); #endif - - if (t->init) { - ret = tracer_init(t, tr); - if (ret) - goto out; + tr->current_trace_flags = nop_trace.flags; + return ret; + } } - tr->current_trace = t; - tr->current_trace->enabled = true; + tr->current_trace = trace; + tr->current_trace->enabled++; trace_branch_enable(tr); - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; - int i; + char *name; size_t ret; int err; @@ -3889,16 +6420,14 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + name = strim(buf); - err = tracing_set_tracer(buf); + err = tracing_set_tracer(tr, name); if (err) return err; @@ -3908,10 +6437,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; char buf[64]; int r; @@ -3923,10 +6451,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf, } static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned long *ptr = filp->private_data; unsigned long val; int ret; @@ -3939,38 +6466,107 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + int ret; + + guard(mutex)(&trace_types_lock); + ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); + if (ret < 0) + return ret; + + if (tr->current_trace->update_thresh) { + ret = tr->current_trace->update_thresh(tr); + if (ret < 0) + return ret; + } + + return cnt; +} + +#ifdef CONFIG_TRACER_MAX_TRACE + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); +} + +#endif + +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + if (cpumask_empty(tr->pipe_cpumask)) { + cpumask_setall(tr->pipe_cpumask); + return 0; + } + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { + cpumask_set_cpu(cpu, tr->pipe_cpumask); + return 0; + } + return -EBUSY; +} + +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + WARN_ON(!cpumask_full(tr->pipe_cpumask)); + cpumask_clear(tr->pipe_cpumask); + } else { + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); + cpumask_clear_cpu(cpu, tr->pipe_cpumask); + } +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; - - if (tracing_disabled) - return -ENODEV; + int cpu; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + cpu = tracing_get_cpu(inode); + ret = open_pipe_on_cpu(tr, cpu); + if (ret) + goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; - goto out; + goto fail_alloc_iter; } - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. - */ - iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - ret = -ENOMEM; - goto fail; - } - *iter->trace = *tr->current_trace; + trace_seq_init(&iter->seq); + iter->trace = tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3980,16 +6576,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); - if (trace_flags & TRACE_ITER_LATENCY_FMT) + if (tr->trace_flags & TRACE_ITER(LATENCY_FMT)) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - iter->cpu_file = tc->cpu; - iter->tr = tc->tr; - iter->trace_buffer = &tc->tr->trace_buffer; + iter->tr = tr; + iter->array_buffer = &tr->array_buffer; + iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; @@ -3997,34 +6593,34 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->trace->pipe_open(iter); nonseekable_open(inode, filp); -out: - mutex_unlock(&trace_types_lock); + + tr->trace_ref++; + return ret; fail: - kfree(iter->trace); kfree(iter); +fail_alloc_iter: + close_pipe_on_cpu(tr, cpu); +fail_pipe_on_cpu: __trace_array_put(tr); - mutex_unlock(&trace_types_lock); return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; - - mutex_lock(&trace_types_lock); + struct trace_array *tr = inode->i_private; - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); + scoped_guard(mutex, &trace_types_lock) { + tr->trace_ref--; - mutex_unlock(&trace_types_lock); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + } - free_cpumask_var(iter->started); - mutex_destroy(&iter->mutex); - kfree(iter->trace); + free_trace_iter_content(iter); kfree(iter); trace_array_put(tr); @@ -4032,24 +6628,26 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) return 0; } -static unsigned int +static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { + struct trace_array *tr = iter->tr; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) + if (tr->trace_flags & TRACE_ITER(BLOCK)) /* * Always select as readable when in blocking mode */ - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; else - return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, - filp, poll_table); + return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, + filp, poll_table, iter->tr->buffer_percent); } -static unsigned int +static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; @@ -4057,29 +6655,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); -} - -/* Must be called with trace_types_lock mutex held. */ +/* Must be called with iter->mutex held. */ static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; + int ret; while (trace_empty(iter)) { @@ -4087,15 +6667,6 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -4105,13 +6676,38 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; + + mutex_unlock(&iter->mutex); + + ret = wait_on_pipe(iter, 0); + + mutex_lock(&iter->mutex); + + if (ret) + return ret; } return 1; } +static bool update_last_data_if_empty(struct trace_array *tr) +{ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return false; + + if (!ring_buffer_empty(tr->array_buffer.buffer)) + return false; + + /* + * If the buffer contains the last boot data and all per-cpu + * buffers are empty, reset it from the kernel side. + */ + update_last_data(tr); + return true; +} + /* * Consumer reader. */ @@ -4120,9 +6716,15 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array *tr = iter->tr; ssize_t sret; + /* + * Avoid more than one consumer on a single file descriptor + * This is just a matter of traces coherency, the ring buffer itself + * is protected. + */ + guard(mutex)(&iter->mutex); + /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) @@ -4130,60 +6732,61 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_seq_init(&iter->seq); - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - - /* - * Avoid more than one consumer on a single file descriptor - * This is just a matter of traces coherency, the ring buffer itself - * is protected. - */ - mutex_lock(&iter->mutex); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) - goto out; + return sret; } waitagain: + if (update_last_data_if_empty(iter->tr)) + return 0; + sret = tracing_wait_pipe(filp); if (sret <= 0) - goto out; + return sret; /* stop when tracing is finished */ - if (trace_empty(iter)) { - sret = 0; - goto out; - } + if (trace_empty(iter)) + return 0; - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; + if (cnt >= TRACE_SEQ_BUFFER_SIZE) + cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset(&iter->seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); - iter->pos = -1; + trace_iterator_reset(iter); + cpumask_clear(iter->started); + trace_seq_init(&iter->seq); trace_event_read_lock(); trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; - int len = iter->seq.len; + int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { - /* don't print partial lines */ - iter->seq.len = len; + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. + */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ + iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); - if (iter->seq.len >= cnt) + if (trace_seq_used(&iter->seq) >= cnt) break; /* @@ -4199,7 +6802,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.readpos >= iter->seq.len) + if (iter->seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* @@ -4209,52 +6812,46 @@ waitagain: if (sret == -EBUSY) goto waitagain; -out: - mutex_unlock(&iter->mutex); - return sret; } -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - __free_page(buf->page); -} - static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { __free_page(spd->pages[idx]); } -static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; + int save_len; int ret; /* Seq buffer is page-sized, exactly what we need. */ for (;;) { - count = iter->seq.len; + save_len = iter->seq.seq.len; ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; + + if (trace_seq_has_overflowed(&iter->seq)) { + iter->seq.seq.len = save_len; break; } + + /* + * This should not be hit, because it should only + * be set if the iter->seq overflowed. But check it + * anyway to be safe. + */ if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; + iter->seq.seq.len = save_len; + break; + } + + count = trace_seq_used(&iter->seq) - save_len; + if (rem < count) { + rem = 0; + iter->seq.seq.len = save_len; break; } @@ -4285,11 +6882,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &tracing_pipe_buf_ops, + .ops = &default_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; - struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -4297,12 +6892,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - /* copy the tracer to avoid using a global lock all around */ - mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; - mutex_unlock(&trace_types_lock); - mutex_lock(&iter->mutex); if (iter->trace->splice_read) { @@ -4325,7 +6914,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; @@ -4335,13 +6924,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - iter->seq.len); + min((size_t)trace_seq_used(&iter->seq), + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = iter->seq.len; + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } @@ -4352,7 +6942,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; @@ -4363,18 +6956,56 @@ out_err: } static ssize_t +tracing_syscall_buf_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + char buf[64]; + int r; + + r = snprintf(buf, 64, "%d\n", tr->syscall_buf_sz); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_syscall_buf_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > SYSCALL_FAULT_USER_MAX) + val = SYSCALL_FAULT_USER_MAX; + + tr->syscall_buf_sz = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; - struct trace_array *tr = tc->tr; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); - if (tc->cpu == RING_BUFFER_ALL_CPUS) { + if (cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; @@ -4384,15 +7015,15 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; - if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { + size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) { buf_size_same = 0; break; } } if (buf_size_same) { - if (!ring_buffer_expanded) + if (!tr->ring_buffer_expanded) r = sprintf(buf, "%lu (expanded: %lu)\n", size >> 10, trace_buf_size >> 10); @@ -4401,7 +7032,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -4413,7 +7044,8 @@ static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; unsigned long val; int ret; @@ -4427,8 +7059,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - - ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); + ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); if (ret < 0) return ret; @@ -4448,11 +7079,11 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; - if (!ring_buffer_expanded) + size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; + if (!tr->ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } - if (ring_buffer_expanded) + if (tr->ring_buffer_expanded) r = sprintf(buf, "%lu\n", size); else r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); @@ -4461,6 +7092,120 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +#define LAST_BOOT_HEADER ((void *)1) + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_scratch *tscratch = tr->scratch; + unsigned int index = *pos; + + (*pos)++; + + if (*pos == 1) + return LAST_BOOT_HEADER; + + /* Only show offsets of the last boot data */ + if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return NULL; + + /* *pos 0 is for the header, 1 is for the first module */ + index--; + + if (index >= tscratch->nr_entries) + return NULL; + + return &tscratch->entries[index]; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&scratch_mutex); + + return l_next(m, NULL, pos); +} + +static void l_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&scratch_mutex); +} + +static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) +{ + struct trace_scratch *tscratch = tr->scratch; + + /* + * Do not leak KASLR address. This only shows the KASLR address of + * the last boot. When the ring buffer is started, the LAST_BOOT + * flag gets cleared, and this should only report "current". + * Otherwise it shows the KASLR address from the previous boot which + * should not be the same as the current boot. + */ + if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); + else + seq_puts(m, "# Current\n"); +} + +static int l_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + struct trace_mod_entry *entry = v; + + if (v == LAST_BOOT_HEADER) { + show_last_boot_header(m, tr); + return 0; + } + + seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); + return 0; +} + +static const struct seq_operations last_boot_seq_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int tracing_last_boot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = seq_open(file, &last_boot_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; +} + +static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu); + if (ret < 0) + __trace_array_put(tr); + return ret; +} + static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4481,8 +7226,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; /* disable tracing ? */ - if (trace_flags & TRACE_ITER_STOP_ON_FREE) - tracing_off(); + if (tr->trace_flags & TRACE_ITER(STOP_ON_FREE)) + tracer_tracing_off(tr); /* resize the ring buffer to 0 */ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); @@ -4491,111 +7236,510 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +#define TRACE_MARKER_MAX_SIZE 4096 + +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char *buf, + size_t cnt, unsigned long ip) { - unsigned long addr = (unsigned long)ubuf; - struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; - struct ring_buffer *buffer; + enum event_trigger_type tt = ETT_NONE; + struct trace_buffer *buffer; struct print_entry *entry; - unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; + int meta_size; ssize_t written; - int offset; - int size; - int len; + size_t size; + + meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ + again: + size = cnt + meta_size; + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + tracing_gen_ctx()); + if (unlikely(!event)) { + /* + * If the size was greater than what was allowed, then + * make it smaller and try again. + */ + if (size > ring_buffer_max_event_size(buffer)) { + cnt = ring_buffer_max_event_size(buffer) - meta_size; + /* The above should only happen once */ + if (WARN_ON_ONCE(cnt + meta_size == size)) + return -EBADF; + goto again; + } + + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; + } + + entry = ring_buffer_event_data(event); + entry->ip = ip; + memcpy(&entry->buf, buf, cnt); + written = cnt; + + if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { + /* do not add \n before testing triggers, but add \0 */ + entry->buf[cnt] = '\0'; + tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); + } + + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; + + if (static_branch_unlikely(&trace_marker_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_MARKER); + __buffer_unlock_commit(buffer, event); + + if (tt) + event_triggers_post_call(tr->trace_marker_file, tt); + + return written; +} + +struct trace_user_buf { + char *buf; +}; + +static DEFINE_MUTEX(trace_user_buffer_mutex); +static struct trace_user_buf_info *trace_user_buffer; + +/** + * trace_user_fault_destroy - free up allocated memory of a trace user buffer + * @tinfo: The descriptor to free up + * + * Frees any data allocated in the trace info dsecriptor. + */ +void trace_user_fault_destroy(struct trace_user_buf_info *tinfo) +{ + char *buf; + int cpu; + + if (!tinfo || !tinfo->tbuf) + return; + + for_each_possible_cpu(cpu) { + buf = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + kfree(buf); + } + free_percpu(tinfo->tbuf); +} + +static int user_fault_buffer_enable(struct trace_user_buf_info *tinfo, size_t size) +{ + char *buf; + int cpu; + + lockdep_assert_held(&trace_user_buffer_mutex); + + tinfo->tbuf = alloc_percpu(struct trace_user_buf); + if (!tinfo->tbuf) + return -ENOMEM; + + tinfo->ref = 1; + tinfo->size = size; + + /* Clear each buffer in case of error */ + for_each_possible_cpu(cpu) { + per_cpu_ptr(tinfo->tbuf, cpu)->buf = NULL; + } + + for_each_possible_cpu(cpu) { + buf = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!buf) + return -ENOMEM; + per_cpu_ptr(tinfo->tbuf, cpu)->buf = buf; + } + + return 0; +} + +/* For internal use. Free and reinitialize */ +static void user_buffer_free(struct trace_user_buf_info **tinfo) +{ + lockdep_assert_held(&trace_user_buffer_mutex); + + trace_user_fault_destroy(*tinfo); + kfree(*tinfo); + *tinfo = NULL; +} + +/* For internal use. Initialize and allocate */ +static int user_buffer_init(struct trace_user_buf_info **tinfo, size_t size) +{ + bool alloc = false; int ret; - int i; - if (tracing_disabled) - return -EINVAL; + lockdep_assert_held(&trace_user_buffer_mutex); + + if (!*tinfo) { + alloc = true; + *tinfo = kzalloc(sizeof(**tinfo), GFP_KERNEL); + if (!*tinfo) + return -ENOMEM; + } + + ret = user_fault_buffer_enable(*tinfo, size); + if (ret < 0 && alloc) + user_buffer_free(tinfo); + + return ret; +} + +/* For internal use, derefrence and free if necessary */ +static void user_buffer_put(struct trace_user_buf_info **tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!*tinfo || !(*tinfo)->ref)) + return; + + if (--(*tinfo)->ref) + return; + + user_buffer_free(tinfo); +} + +/** + * trace_user_fault_init - Allocated or reference a per CPU buffer + * @tinfo: A pointer to the trace buffer descriptor + * @size: The size to allocate each per CPU buffer + * + * Create a per CPU buffer that can be used to copy from user space + * in a task context. When calling trace_user_fault_read(), preemption + * must be disabled, and it will enable preemption and copy user + * space data to the buffer. If any schedule switches occur, it will + * retry until it succeeds without a schedule switch knowing the buffer + * is still valid. + * + * Returns 0 on success, negative on failure. + */ +int trace_user_fault_init(struct trace_user_buf_info *tinfo, size_t size) +{ + int ret; - if (!(trace_flags & TRACE_ITER_MARKERS)) + if (!tinfo) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; + guard(mutex)(&trace_user_buffer_mutex); + + ret = user_buffer_init(&tinfo, size); + if (ret < 0) + trace_user_fault_destroy(tinfo); + + return ret; +} + +/** + * trace_user_fault_get - up the ref count for the user buffer + * @tinfo: A pointer to a pointer to the trace buffer descriptor + * + * Ups the ref count of the trace buffer. + * + * Returns the new ref count. + */ +int trace_user_fault_get(struct trace_user_buf_info *tinfo) +{ + if (!tinfo) + return -1; + + guard(mutex)(&trace_user_buffer_mutex); + + tinfo->ref++; + return tinfo->ref; +} + +/** + * trace_user_fault_put - dereference a per cpu trace buffer + * @tinfo: The @tinfo that was passed to trace_user_fault_get() + * + * Decrement the ref count of @tinfo. + * + * Returns the new refcount (negative on error). + */ +int trace_user_fault_put(struct trace_user_buf_info *tinfo) +{ + guard(mutex)(&trace_user_buffer_mutex); + + if (WARN_ON_ONCE(!tinfo || !tinfo->ref)) + return -1; + + --tinfo->ref; + return tinfo->ref; +} + +/** + * trace_user_fault_read - Read user space into a per CPU buffer + * @tinfo: The @tinfo allocated by trace_user_fault_get() + * @ptr: The user space pointer to read + * @size: The size of user space to read. + * @copy_func: Optional function to use to copy from user space + * @data: Data to pass to copy_func if it was supplied + * + * Preemption must be disabled when this is called, and must not + * be enabled while using the returned buffer. + * This does the copying from user space into a per CPU buffer. + * + * The @size must not be greater than the size passed in to + * trace_user_fault_init(). + * + * If @copy_func is NULL, trace_user_fault_read() will use copy_from_user(), + * otherwise it will call @copy_func. It will call @copy_func with: + * + * buffer: the per CPU buffer of the @tinfo. + * ptr: The pointer @ptr to user space to read + * size: The @size of the ptr to read + * data: The @data parameter + * + * It is expected that @copy_func will return 0 on success and non zero + * if there was a fault. + * + * Returns a pointer to the buffer with the content read from @ptr. + * Preemption must remain disabled while the caller accesses the + * buffer returned by this function. + * Returns NULL if there was a fault, or the size passed in is + * greater than the size passed to trace_user_fault_init(). + */ +char *trace_user_fault_read(struct trace_user_buf_info *tinfo, + const char __user *ptr, size_t size, + trace_user_buf_copy copy_func, void *data) +{ + int cpu = smp_processor_id(); + char *buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + unsigned int cnt; + int trys = 0; + int ret; + + lockdep_assert_preemption_disabled(); /* - * Userspace is injecting traces into the kernel trace buffer. - * We want to be as non intrusive as possible. - * To do so, we do not want to allocate any special buffers - * or take any locks, but instead write the userspace data - * straight into the ring buffer. - * - * First we need to pin the userspace buffer into memory, - * which, most likely it is, because it just referenced it. - * But there's no guarantee that it is. By using get_user_pages_fast() - * and kmap_atomic/kunmap_atomic() we can get access to the - * pages directly. We then write the data directly into the - * ring buffer. + * It's up to the caller to not try to copy more than it said + * it would. */ - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + if (size > tinfo->size) + return NULL; - /* check if we cross pages */ - if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) - nr_pages = 2; + /* + * This acts similar to a seqcount. The per CPU context switches are + * recorded, migration is disabled and preemption is enabled. The + * read of the user space memory is copied into the per CPU buffer. + * Preemption is disabled again, and if the per CPU context switches count + * is still the same, it means the buffer has not been corrupted. + * If the count is different, it is assumed the buffer is corrupted + * and reading must be tried again. + */ - offset = addr & (PAGE_SIZE - 1); - addr &= PAGE_MASK; + do { + /* + * If for some reason, copy_from_user() always causes a context + * switch, this would then cause an infinite loop. + * If this task is preempted by another user space task, it + * will cause this task to try again. But just in case something + * changes where the copying from user space causes another task + * to run, prevent this from going into an infinite loop. + * 100 tries should be plenty. + */ + if (WARN_ONCE(trys++ > 100, "Error: Too many tries to read user space")) + return NULL; - ret = get_user_pages_fast(addr, nr_pages, 0, pages); - if (ret < nr_pages) { - while (--ret >= 0) - put_page(pages[ret]); - written = -EFAULT; - goto out; + /* Read the current CPU context switch counter */ + cnt = nr_context_switches_cpu(cpu); + + /* + * Preemption is going to be enabled, but this task must + * remain on this CPU. + */ + migrate_disable(); + + /* + * Now preemption is being enabled and another task can come in + * and use the same buffer and corrupt our data. + */ + preempt_enable_notrace(); + + /* Make sure preemption is enabled here */ + lockdep_assert_preemption_enabled(); + + if (copy_func) { + ret = copy_func(buffer, ptr, size, data); + } else { + ret = __copy_from_user(buffer, ptr, size); + } + + preempt_disable_notrace(); + migrate_enable(); + + /* if it faulted, no need to test if the buffer was corrupted */ + if (ret) + return NULL; + + /* + * Preemption is disabled again, now check the per CPU context + * switch counter. If it doesn't match, then another user space + * process may have schedule in and corrupted our buffer. In that + * case the copying must be retried. + */ + } while (nr_context_switches_cpu(cpu) != cnt); + + return buffer; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + unsigned long ip; + char *buf; + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) + return -EINVAL; + + if ((ssize_t)cnt < 0) + return -EINVAL; + + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); + if (!buf) + return -EFAULT; + + /* The selftests expect this function to be the IP address */ + ip = _THIS_IP_; + + /* The global trace_marker can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_marker_to_buffer(tr, buf, cnt, ip); + if (written < 0) + break; + } + } else { + written = write_marker_to_buffer(tr, buf, cnt, ip); } - for (i = 0; i < nr_pages; i++) - map_page[i] = kmap_atomic(pages[i]); + return written; +} - local_save_flags(irq_flags); - size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = tr->trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, preempt_count()); - if (!event) { +static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, + const char *buf, size_t cnt) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + size_t size; + + /* cnt includes both the entry->id and the data behind it. */ + size = struct_offset(entry, id) + cnt; + + buffer = tr->array_buffer.buffer; + + if (size > ring_buffer_max_event_size(buffer)) + return -EINVAL; + + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, + tracing_gen_ctx()); + if (!event) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; + unsafe_memcpy(&entry->id, buf, cnt, + "id and content already reserved on ring buffer" + "'buf' includes the 'id' and the data." + "'entry' was allocated with cnt from 'id'."); + written = cnt; - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->buf, map_page[0] + offset, len); - memcpy(&entry->buf[len], map_page[1], cnt - len); - } else - memcpy(&entry->buf, map_page[0] + offset, cnt); + __buffer_unlock_commit(buffer, event); - if (entry->buf[cnt - 1] != '\n') { - entry->buf[cnt] = '\n'; - entry->buf[cnt + 1] = '\0'; - } else - entry->buf[cnt] = '\0'; + return written; +} - __buffer_unlock_commit(buffer, event); +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + char *buf; - written = cnt; + if (tracing_disabled) + return -EINVAL; - *fpos += written; + if (!(tr->trace_flags & TRACE_ITER(MARKERS))) + return -EINVAL; - out_unlock: - for (i = 0; i < nr_pages; i++){ - kunmap_atomic(map_page[i]); - put_page(pages[i]); + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int)) + return -EINVAL; + + /* raw write is all or nothing */ + if (cnt > TRACE_MARKER_MAX_SIZE) + return -EINVAL; + + /* Must have preemption disabled while having access to the buffer */ + guard(preempt_notrace)(); + + buf = trace_user_fault_read(trace_user_buffer, ubuf, cnt, NULL, NULL); + if (!buf) + return -EFAULT; + + /* The global trace_marker_raw can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_raw_marker_to_buffer(tr, buf, cnt); + if (written < 0) + break; + } + } else { + written = write_raw_marker_to_buffer(tr, buf, cnt); } - out: + return written; } +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + int ret; + + scoped_guard(mutex, &trace_user_buffer_mutex) { + if (!trace_user_buffer) { + ret = user_buffer_init(&trace_user_buffer, TRACE_MARKER_MAX_SIZE); + if (ret < 0) + return ret; + } else { + trace_user_buffer->ref++; + } + } + + stream_open(inode, filp); + ret = tracing_open_generic_tr(inode, filp); + if (ret < 0) + user_buffer_put(&trace_user_buffer); + return ret; +} + +static int tracing_mark_release(struct inode *inode, struct file *file) +{ + user_buffer_put(&trace_user_buffer); + return tracing_release_generic_tr(inode, file); +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -4611,25 +7755,10 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +int tracing_set_clock(struct trace_array *tr, const char *clockstr) { - struct seq_file *m = filp->private_data; - struct trace_array *tr = m->private; - char buf[64]; - const char *clockstr; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - clockstr = strstrip(buf); - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; @@ -4637,25 +7766,55 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr->clock_id = i; - ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); + ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ - tracing_reset_online_cpus(&global_trace.trace_buffer); + tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); - tracing_reset_online_cpus(&global_trace.max_buffer); + tracing_reset_online_cpus(&tr->max_buffer); #endif - mutex_unlock(&trace_types_lock); + if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = i; + } + + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; *fpos += cnt; @@ -4667,11 +7826,9 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) @@ -4680,51 +7837,119 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return ret; } +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + + guard(mutex)(&trace_types_lock); + + if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) + seq_puts(m, "delta [absolute]\n"); + else + seq_puts(m, "[delta] absolute\n"); + + return 0; +} + +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) +{ + if (rbe == this_cpu_read(trace_buffered_event)) + return ring_buffer_time_stamp(buffer); + + return ring_buffer_event_time_stamp(buffer, rbe); +} + +/* + * Set or disable using the per CPU trace_buffer_event when possible. + */ +int tracing_set_filter_buffering(struct trace_array *tr, bool set) +{ + guard(mutex)(&trace_types_lock); + + if (set && tr->no_filter_buffering_ref++) + return 0; + + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) + return -EINVAL; + + --tr->no_filter_buffering_ref; + } + + return 0; +} + struct ftrace_buffer_info { struct trace_iterator iter; void *spare; + unsigned int spare_cpu; + unsigned int spare_size; unsigned int read; }; #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if (file->f_mode & FMODE_READ) { - iter = __tracing_open(tr, tc, inode, file, true); + iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); } else { /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; m = kzalloc(sizeof(*m), GFP_KERNEL); if (!m) - return -ENOMEM; + goto out; iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { kfree(m); - return -ENOMEM; + goto out; } + ret = 0; + iter->tr = tr; - iter->trace_buffer = &tc->tr->max_buffer; - iter->cpu_file = tc->cpu; + iter->array_buffer = &tr->max_buffer; + iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; } - +out: if (ret < 0) trace_array_put(tr); return ret; } +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4735,7 +7960,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, unsigned long val; int ret; - ret = tracing_update_buffers(); + ret = tracing_update_buffers(tr); if (ret < 0) return ret; @@ -4743,49 +7968,58 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) { + if (tr->current_trace->use_max_tr) + return -EBUSY; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) ret = -EBUSY; - goto out; - } + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + return ret; switch (val) { case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; #endif - if (!tr->allocated_snapshot) { - ret = alloc_snapshot(tr); - if (ret < 0) - break; - } - local_irq_disable(); + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->array_buffer, iter->cpu_file); + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + /* Now, we're going to swap */ - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - update_max_tr(tr, current, smp_processor_id()); - else - update_max_tr_single(tr, current, iter->cpu_file); - local_irq_enable(); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); + update_max_tr(tr, current, smp_processor_id(), NULL); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } + tracing_disarm_snapshot(tr); break; default: if (tr->allocated_snapshot) { if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else - tracing_reset(&tr->max_buffer, iter->cpu_file); + tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); } break; } @@ -4794,8 +8028,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; ret = cnt; } -out: - mutex_unlock(&trace_types_lock); + return ret; } @@ -4829,6 +8062,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; + /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; @@ -4841,7 +8075,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) } info->iter.snapshot = true; - info->iter.trace_buffer = &info->iter.tr->max_buffer; + info->iter.array_buffer = &info->iter.tr->max_buffer; return ret; } @@ -4849,18 +8083,29 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) #endif /* CONFIG_TRACER_SNAPSHOT */ -static const struct file_operations tracing_max_lat_fops = { +static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, + .read = tracing_thresh_read, + .write = tracing_thresh_write, + .llseek = generic_file_llseek, +}; + +#ifdef CONFIG_TRACER_MAX_TRACE +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic_tr, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; +#endif static const struct file_operations set_tracer_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { @@ -4869,15 +8114,29 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, - .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic_tc, + .open = tracing_open_generic_tr, .read = tracing_entries_read, .write = tracing_entries_write, .llseek = generic_file_llseek, - .release = tracing_release_generic_tc, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_syscall_buf_fops = { + .open = tracing_open_generic_tr, + .read = tracing_syscall_buf_read, + .write = tracing_syscall_buf_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_buffer_meta_fops = { + .open = tracing_buffer_meta_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, }; static const struct file_operations tracing_total_entries_fops = { @@ -4894,10 +8153,15 @@ static const struct file_operations tracing_free_buffer_fops = { }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_write, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, + .release = tracing_mark_release, +}; + +static const struct file_operations tracing_mark_raw_fops = { + .open = tracing_mark_open, + .write = tracing_mark_raw_write, + .release = tracing_mark_release, }; static const struct file_operations trace_clock_fops = { @@ -4908,12 +8172,26 @@ static const struct file_operations trace_clock_fops = { .write = tracing_clock_write, }; +static const struct file_operations trace_time_stamp_mode_fops = { + .open = tracing_time_stamp_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, +}; + +static const struct file_operations last_boot_fops = { + .open = tracing_last_boot_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, +}; + #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_snapshot_release, }; @@ -4922,25 +8200,380 @@ static const struct file_operations snapshot_raw_fops = { .read = tracing_buffers_read, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, - .llseek = no_llseek, }; #endif /* CONFIG_TRACER_SNAPSHOT */ +/* + * trace_min_max_write - Write a u64 value to a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the write interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param structure that + * defines where to write the value, the min and the max acceptable values, + * and a lock to protect the write. + */ +static ssize_t +trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + u64 val; + int err; + + if (!param) + return -EFAULT; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + if (param->lock) + mutex_lock(param->lock); + + if (param->min && val < *param->min) + err = -EINVAL; + + if (param->max && val > *param->max) + err = -EINVAL; + + if (!err) + *param->val = val; + + if (param->lock) + mutex_unlock(param->lock); + + if (err) + return err; + + return cnt; +} + +/* + * trace_min_max_read - Read a u64 value from a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the read interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param struct with valid + * data. + */ +static ssize_t +trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + char buf[U64_STR_SIZE]; + int len; + u64 val; + + if (!param) + return -EFAULT; + + val = *param->val; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +const struct file_operations trace_min_max_fops = { + .open = tracing_open_generic, + .read = trace_min_max_read, + .write = trace_min_max_write, +}; + +#define TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u16 pos; /* caret position */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char *cmd; /* what caused err */ +}; + +static DEFINE_MUTEX(tracing_err_log_lock); + +static struct tracing_log_err *alloc_tracing_log_err(int len) +{ + struct tracing_log_err *err; + + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + return ERR_PTR(-ENOMEM); + + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) { + kfree(err); + return ERR_PTR(-ENOMEM); + } + + return err; +} + +static void free_tracing_log_err(struct tracing_log_err *err) +{ + kfree(err->cmd); + kfree(err); +} + +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, + int len) +{ + struct tracing_log_err *err; + char *cmd; + + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = alloc_tracing_log_err(len); + if (PTR_ERR(err) != -ENOMEM) + tr->n_err_log_entries++; + + return err; + } + cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) + return ERR_PTR(-ENOMEM); + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); + kfree(err->cmd); + err->cmd = cmd; + list_del(&err->list); + + return err; +} + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurrence of @str within @cmd. The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurrence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * <loc>: error: <text> + * Command: <cmd> + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). + */ +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u16 pos) +{ + struct tracing_log_err *err; + int len = 0; + + if (!tr) + tr = &global_trace; + + len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; + + guard(mutex)(&tracing_err_log_lock); + + err = get_tracing_log_err(tr, len); + if (PTR_ERR(err) == -ENOMEM) + return; + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tr->err_log); +} + +static void clear_tracing_err_log(struct trace_array *tr) +{ + struct tracing_log_err *err, *next; + + guard(mutex)(&tracing_err_log_lock); + + list_for_each_entry_safe(err, next, &tr->err_log, list) { + list_del(&err->list); + free_tracing_log_err(err); + } + + tr->n_err_log_entries = 0; +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + struct trace_array *tr = m->private; + + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tr->err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) +{ + u16 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret = 0; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(tr); + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &tracing_err_log_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + trace_array_put(tr); + } + } + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int tracing_err_log_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = tracing_err_log_release, +}; + static int tracing_buffers_open(struct inode *inode, struct file *filp) { - struct trace_cpu *tc = inode->i_private; - struct trace_array *tr = tc->tr; + struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; - info = kzalloc(sizeof(*info), GFP_KERNEL); + info = kvzalloc(sizeof(*info), GFP_KERNEL); if (!info) { trace_array_put(tr); return -ENOMEM; @@ -4948,18 +8581,18 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - tr->ref++; - info->iter.tr = tr; - info->iter.cpu_file = tc->cpu; + info->iter.cpu_file = tracing_get_cpu(inode); info->iter.trace = tr->current_trace; - info->iter.trace_buffer = &tr->trace_buffer; + info->iter.array_buffer = &tr->array_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; filp->private_data = info; + tr->trace_ref++; + mutex_unlock(&trace_types_lock); ret = nonseekable_open(inode, filp); @@ -4969,7 +8602,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) return ret; } -static unsigned int +static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; @@ -4984,134 +8617,163 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + void *trace_data; + int page_size; + ssize_t ret = 0; ssize_t size; if (!count) return 0; - mutex_lock(&trace_types_lock); - #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - size = -EBUSY; - goto out_unlock; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif - if (!info->spare) - info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + + /* Make sure the spare matches the current sub buffer size */ + if (info->spare) { + if (page_size != info->spare_size) { + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + info->spare = NULL; + } + } + + if (!info->spare) { + info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); - size = -ENOMEM; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + info->spare_size = page_size; + } + } if (!info->spare) - goto out_unlock; + return ret; /* Do we have previous read data to read? */ - if (info->read < PAGE_SIZE) + if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); - ret = ring_buffer_read_page(iter->trace_buffer->buffer, - &info->spare, + ret = ring_buffer_read_page(iter->array_buffer->buffer, + info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); if (ret < 0) { - if (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) { - size = -EAGAIN; - goto out_unlock; - } - mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); - mutex_lock(&trace_types_lock); - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } + if (trace_empty(iter) && !iter->closed) { + if (update_last_data_if_empty(iter->tr)) + return 0; + + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ret = wait_on_pipe(iter, 0); + if (ret) + return ret; + goto again; } - size = 0; - goto out_unlock; + return 0; } info->read = 0; read: - size = PAGE_SIZE - info->read; + size = page_size - info->read; if (size > count) size = count; + trace_data = ring_buffer_read_page_data(info->spare); + ret = copy_to_user(ubuf, trace_data + info->read, size); + if (ret == size) + return -EFAULT; - ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) { - size = -EFAULT; - goto out_unlock; - } size -= ret; *ppos += size; info->read += size; - out_unlock: - mutex_unlock(&trace_types_lock); - return size; } +static int tracing_buffers_flush(struct file *file, fl_owner_t id) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + iter->closed = true; + /* Make sure the waiters see the new wait_index */ + (void)atomic_fetch_inc_release(&iter->wait_index); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + return 0; +} + static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + + iter->tr->trace_ref--; __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); - kfree(info); - - mutex_unlock(&trace_types_lock); + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + kvfree(info); return 0; } struct buffer_ref { - struct ring_buffer *buffer; + struct trace_buffer *buffer; void *page; - int ref; + int cpu; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + if (refcount_read(&ref->refcount) > INT_MAX/2) + return false; + + refcount_inc(&ref->refcount); + return true; } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, .get = buffer_pipe_buf_get, }; @@ -5124,11 +8786,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -5145,87 +8803,77 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; - ssize_t ret; - - mutex_lock(&trace_types_lock); + bool woken = false; + int page_size; + int entries, i; + ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; #endif - if (splice_grow_spd(pipe, &spd)) { - ret = -ENOMEM; - goto out; - } + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (*ppos & (page_size - 1)) + return -EINVAL; - if (*ppos & (PAGE_SIZE - 1)) { - ret = -EINVAL; - goto out; + if (len & (page_size - 1)) { + if (len < page_size) + return -EINVAL; + len &= (~(page_size - 1)); } - if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) { - ret = -EINVAL; - goto out; - } - len &= PAGE_MASK; - } + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; again: trace_access_lock(iter->cpu_file); - entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) + if (!ref) { + ret = -ENOMEM; break; + } - ref->ref = 1; - ref->buffer = iter->trace_buffer->buffer; + refcount_set(&ref->refcount, 1); + ref->buffer = iter->array_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } + ref->cpu = iter->cpu_file; - r = ring_buffer_read_page(ref->buffer, &ref->page, + r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - - page = virt_to_page(ref->page); + page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; - spd.partial[i].len = PAGE_SIZE; + spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; - *ppos += PAGE_SIZE; + *ppos += page_size; - entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); @@ -5233,24 +8881,154 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { - if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { - ret = -EAGAIN; + + if (ret) goto out; - } - mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); - mutex_lock(&trace_types_lock); - if (signal_pending(current)) { - ret = -EINTR; + + if (woken) goto out; - } + + ret = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + goto out; + + ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); + if (ret) + goto out; + + /* No need to wait after waking up when tracing is off */ + if (!tracer_tracing_is_on(iter->tr)) + goto out; + + /* Iterate one more time to collect any new data then exit */ + woken = true; + goto again; } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(&spd); out: - mutex_unlock(&trace_types_lock); + splice_shrink_spd(&spd); + + return ret; +} + +static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + int err; + + if (cmd == TRACE_MMAP_IOCTL_GET_READER) { + if (!(file->f_flags & O_NONBLOCK)) { + err = ring_buffer_wait(iter->array_buffer->buffer, + iter->cpu_file, + iter->tr->buffer_percent, + NULL, NULL); + if (err) + return err; + } + + return ring_buffer_map_get_reader(iter->array_buffer->buffer, + iter->cpu_file); + } else if (cmd) { + return -ENOTTY; + } + + /* + * An ioctl call with cmd 0 to the ring buffer file will wake up all + * waiters + */ + guard(mutex)(&trace_types_lock); + + /* Make sure the waiters see the new wait_index */ + (void)atomic_fetch_inc_release(&iter->wait_index); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + return 0; +} + +#ifdef CONFIG_TRACER_MAX_TRACE +static int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. + */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} +static void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} +#else +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +#endif + +static void tracing_buffers_mmap_close(struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + + WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); + put_snapshot_map(iter->tr); +} + +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + +static const struct vm_operations_struct tracing_buffers_vmops = { + .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, +}; + +static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + int ret = 0; + + /* A memmap'ed and backup buffers are not supported for user space mmap */ + if (iter->tr->flags & (TRACE_ARRAY_FL_MEMMAP | TRACE_ARRAY_FL_VMALLOC)) + return -ENODEV; + + ret = get_snapshot_map(iter->tr); + if (ret) + return ret; + + ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); + if (ret) + put_snapshot_map(iter->tr); + + vma->vm_ops = &tracing_buffers_vmops; return ret; } @@ -5260,22 +9038,24 @@ static const struct file_operations tracing_buffers_fops = { .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, + .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, - .llseek = no_llseek, + .unlocked_ioctl = tracing_buffers_ioctl, + .mmap = tracing_buffers_mmap, }; static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - struct trace_cpu *tc = filp->private_data; - struct trace_array *tr = tc->tr; - struct trace_buffer *trace_buf = &tr->trace_buffer; + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + struct array_buffer *trace_buf = &tr->array_buffer; + int cpu = tracing_get_cpu(inode); struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; - int cpu = tc->cpu; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -5302,7 +9082,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { @@ -5311,7 +9091,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(trace_buf->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); @@ -5320,7 +9100,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + count = simple_read_from_buffer(ubuf, count, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -5328,40 +9109,42 @@ tracing_stats_read(struct file *filp, char __user *ubuf, } static const struct file_operations tracing_stats_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_stats_read, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; #ifdef CONFIG_DYNAMIC_FTRACE -int __weak ftrace_arch_read_dyn_info(char *buf, int size) -{ - return 0; -} - static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - static char ftrace_dyn_info_buffer[1024]; - static DEFINE_MUTEX(dyn_info_mutex); - unsigned long *p = filp->private_data; - char *buf = ftrace_dyn_info_buffer; - int size = ARRAY_SIZE(ftrace_dyn_info_buffer); + ssize_t ret; + char *buf; int r; - mutex_lock(&dyn_info_mutex); - r = sprintf(buf, "%ld ", *p); - - r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); - buf[r++] = '\n'; + /* 512 should be plenty to hold the amount needed */ +#define DYN_INFO_BUF_SIZE 512 - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; - mutex_unlock(&dyn_info_mutex); + r = scnprintf(buf, DYN_INFO_BUF_SIZE, + "%ld pages:%ld groups: %ld\n" + "ftrace boot update time = %llu (ns)\n" + "ftrace module total update time = %llu (ns)\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups, + ftrace_update_time, + ftrace_total_mod_time); - return r; + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -5373,43 +9156,89 @@ static const struct file_operations tracing_dyn_info_fops = { #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - tracing_snapshot(); + tracing_snapshot_instance(tr); } static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; - if (*count != -1) (*count)--; + } - tracing_snapshot(); + tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:", (void *)ip); - seq_printf(m, "snapshot"); + seq_puts(m, "snapshot"); + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - if (count == -1) - seq_printf(m, ":unlimited\n"); + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, @@ -5418,10 +9247,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = { static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, }; static int -ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -5429,6 +9260,9 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, char *number; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enable) return -EINVAL; @@ -5436,8 +9270,11 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; } if (!param) @@ -5457,10 +9294,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = tracing_arm_snapshot(tr); + if (ret < 0) + return ret; - if (ret >= 0) - alloc_snapshot(&global_trace); + ret = register_ftrace_function_probe(glob, tr, ops, count); + if (ret < 0) + tracing_disarm_snapshot(tr); return ret < 0 ? ret : 0; } @@ -5470,36 +9310,27 @@ static struct ftrace_func_command ftrace_snapshot_cmd = { .func = ftrace_trace_snapshot_callback, }; -static int register_snapshot_cmd(void) +static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else -static inline int register_snapshot_cmd(void) { return 0; } +static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ -struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (tr->dir) - return tr->dir; - - if (!debugfs_initialized()) - return NULL; - + /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - tr->dir = debugfs_create_dir("tracing", NULL); + return NULL; - if (!tr->dir) - pr_warn_once("Could not create debugfs directory 'tracing'\n"); + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* All sub buffers have a descriptor */ return tr->dir; } -struct dentry *tracing_init_dentry(void) -{ - return tracing_init_dentry_tr(&global_trace); -} - static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { struct dentry *d_tracer; @@ -5507,22 +9338,32 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (tr->percpu_dir) return tr->percpu_dir; - d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) return NULL; - tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); - WARN_ONCE(!tr->percpu_dir, - "Could not create debugfs directory 'per_cpu/%d'\n", cpu); + MEM_FAIL(!tr->percpu_dir, + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, + void *data, long cpu, const struct file_operations *fops) +{ + struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + + if (ret) /* See tracing_get_cpu() */ + d_inode(ret)->i_cdev = (void *)(cpu + 1); + return ret; +} + static void -tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5531,35 +9372,40 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); - d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } /* per cpu trace_pipe */ - trace_create_file("trace_pipe", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_pipe_fops); + trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_file("trace", 0644, d_cpu, - (void *)&data->trace_cpu, &tracing_fops); + trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, + tr, cpu, &tracing_fops); - trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_buffers_fops); + trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_buffers_fops); - trace_create_file("stats", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_stats_fops); + trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_stats_fops); - trace_create_file("buffer_size_kb", 0444, d_cpu, - (void *)&data->trace_cpu, &tracing_entries_fops); + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_entries_fops); + if (tr->range_addr_start) + trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_buffer_meta_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_cpu, - (void *)&data->trace_cpu, &snapshot_fops); + if (!tr->range_addr_start) { + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, + tr, cpu, &snapshot_fops); - trace_create_file("snapshot_raw", 0444, d_cpu, - (void *)&data->trace_cpu, &snapshot_raw_fops); + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, + tr, cpu, &snapshot_raw_fops); + } #endif } @@ -5568,13 +9414,6 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) #include "trace_selftest.c" #endif -struct trace_option_dentry { - struct tracer_opt *opt; - struct tracer_flags *flags; - struct trace_array *tr; - struct dentry *entry; -}; - static ssize_t trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -5606,10 +9445,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { - mutex_lock(&trace_types_lock); - ret = __set_tracer_option(topt->tr->current_trace, topt->flags, + guard(mutex)(&trace_types_lock); + ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); - mutex_unlock(&trace_types_lock); if (ret) return ret; } @@ -5619,22 +9457,80 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static int tracing_open_options(struct inode *inode, struct file *filp) +{ + struct trace_option_dentry *topt = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(topt->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + return 0; +} + +static int tracing_release_options(struct inode *inode, struct file *file) +{ + struct trace_option_dentry *topt = file->private_data; + + trace_array_put(topt->tr); + return 0; +} static const struct file_operations trace_options_fops = { - .open = tracing_open_generic, + .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, + .release = tracing_release_options, }; +/* + * In order to pass in both the trace_array descriptor as well as the index + * to the flag that the trace option file represents, the trace_array + * has a character array of trace_flags_index[], which holds the index + * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. + * The address of this character array is passed to the flag option file + * read/write callbacks. + * + * In order to extract both the index and the trace_array descriptor, + * get_tr_index() uses the following algorithm. + * + * idx = *ptr; + * + * As the pointer itself contains the address of the index (remember + * index[1] == 1). + * + * Then to get the trace_array descriptor, by subtracting that index + * from the ptr, we get to the start of the index itself. + * + * ptr - idx == &index[0] + * + * Then a simple container_of() from that pointer gets us to the + * trace_array descriptor. + */ +static void get_tr_index(void *data, struct trace_array **ptr, + unsigned int *pindex) +{ + *pindex = *(unsigned char *)data; + + *ptr = container_of(data - *pindex, struct trace_array, + trace_flags_index); +} + static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; char *buf; - if (trace_flags & (1 << index)) + get_tr_index(tr_index, &tr, &index); + + if (tr->trace_flags & (1ULL << index)) buf = "1\n"; else buf = "0\n"; @@ -5646,11 +9542,14 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = &global_trace; - long index = (long)filp->private_data; + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; unsigned long val; int ret; + get_tr_index(tr_index, &tr, &index); + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -5658,9 +9557,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = set_tracer_flag(tr, 1 << index, val); + ret = set_tracer_flag(tr, 1ULL << index, val); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); if (ret < 0) return ret; @@ -5685,9 +9586,9 @@ struct dentry *trace_create_file(const char *name, { struct dentry *ret; - ret = debugfs_create_file(name, mode, parent, data, fops); + ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create debugfs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -5700,13 +9601,13 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (tr->options) return tr->options; - d_tracer = tracing_init_dentry_tr(tr); - if (!d_tracer) + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) return NULL; - tr->options = debugfs_create_dir("options", d_tracer); + tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create debugfs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -5729,26 +9630,21 @@ create_trace_option_file(struct trace_array *tr, topt->opt = opt; topt->tr = tr; - topt->entry = trace_create_file(opt->name, 0644, t_options, topt, - &trace_options_fops); - + topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, + t_options, topt, &trace_options_fops); } -static struct trace_option_dentry * -create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +static int +create_trace_option_files(struct trace_array *tr, struct tracer *tracer, + struct tracer_flags *flags) { struct trace_option_dentry *topts; - struct tracer_flags *flags; + struct trace_options *tr_topts; struct tracer_opt *opts; int cnt; - if (!tracer) - return NULL; - - flags = tracer->flags; - if (!flags || !flags->opts) - return NULL; + return 0; opts = flags->opts; @@ -5757,29 +9653,118 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) - return NULL; + return 0; - for (cnt = 0; opts[cnt].name; cnt++) + tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), + GFP_KERNEL); + if (!tr_topts) { + kfree(topts); + return -ENOMEM; + } + + tr->topts = tr_topts; + tr->topts[tr->nr_topts].tracer = tracer; + tr->topts[tr->nr_topts].topts = topts; + tr->nr_topts++; + + for (cnt = 0; opts[cnt].name; cnt++) { create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); + MEM_FAIL(topts[cnt].entry == NULL, + "Failed to create trace option: %s", + opts[cnt].name); + } + return 0; +} + +static int get_global_flags_val(struct tracer *tracer) +{ + struct tracers *t; - return topts; + list_for_each_entry(t, &global_trace.tracers, list) { + if (t->tracer != tracer) + continue; + if (!t->flags) + return -1; + return t->flags->val; + } + return -1; } -static void -destroy_trace_option_files(struct trace_option_dentry *topts) +static int add_tracer_options(struct trace_array *tr, struct tracers *t) { - int cnt; + struct tracer *tracer = t->tracer; + struct tracer_flags *flags = t->flags ?: tracer->flags; - if (!topts) - return; + if (!flags) + return 0; + + /* Only add tracer options after update_tracer_options finish */ + if (!tracer_options_updated) + return 0; + + return create_trace_option_files(tr, tracer, flags); +} + +static int add_tracer(struct trace_array *tr, struct tracer *tracer) +{ + struct tracer_flags *flags; + struct tracers *t; + int ret; + + /* Only enable if the directory has been created already. */ + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return 0; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOMEM; + + t->tracer = tracer; + t->flags = NULL; + list_add(&t->list, &tr->tracers); + + flags = tracer->flags; + if (!flags) { + if (!tracer->default_flags) + return 0; + + /* + * If the tracer defines default flags, it means the flags are + * per trace instance. + */ + flags = kmalloc(sizeof(*flags), GFP_KERNEL); + if (!flags) + return -ENOMEM; + + *flags = *tracer->default_flags; + flags->trace = tracer; - for (cnt = 0; topts[cnt].opt; cnt++) { - if (topts[cnt].entry) - debugfs_remove(topts[cnt].entry); + t->flags = flags; + + /* If this is an instance, inherit the global_trace flags */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + int val = get_global_flags_val(tracer); + if (!WARN_ON_ONCE(val < 0)) + flags->val = val; + } } - kfree(topts); + ret = add_tracer_options(tr, t); + if (ret < 0) { + list_del(&t->list); + kfree(t->flags); + kfree(t); + } + + return ret; } static struct dentry * @@ -5792,21 +9777,27 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, (void *)index, - &trace_options_core_fops); + return trace_create_file(option, TRACE_MODE_WRITE, t_options, + (void *)&tr->trace_flags_index[index], + &trace_options_core_fops); } -static __init void create_trace_options_dir(struct trace_array *tr) +static void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; + bool top_level = tr == &global_trace; int i; t_options = trace_options_init_dentry(tr); if (!t_options) return; - for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(tr, trace_options[i], i); + for (i = 0; trace_options[i]; i++) { + if (top_level || + !((1ULL << i) & TOP_LEVEL_TRACE_FLAGS)) { + create_trace_option_core_file(tr, trace_options[i], i); + } + } } static ssize_t @@ -5828,7 +9819,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct trace_buffer *buffer = tr->array_buffer.buffer; unsigned long val; int ret; @@ -5837,8 +9828,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - mutex_lock(&trace_types_lock); - if (val) { + guard(mutex)(&trace_types_lock); + if (!!val == tracer_tracing_is_on(tr)) { + val = 0; /* do nothing */ + } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); @@ -5846,8 +9839,9 @@ rb_simple_write(struct file *filp, const char __user *ubuf, tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); + /* Wake up any waiters */ + ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } - mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -5863,272 +9857,782 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; -struct dentry *trace_instance_dir; +static ssize_t +buffer_percent_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = tr->buffer_percent; + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_percent_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + tr->buffer_percent = val; + + (*ppos)++; + + return cnt; +} + +static const struct file_operations buffer_percent_fops = { + .open = tracing_open_generic_tr, + .read = buffer_percent_read, + .write = buffer_percent_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + +static ssize_t +buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + size_t size; + char buf[64]; + int order; + int r; + + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + size = (PAGE_SIZE << order) / 1024; + + r = sprintf(buf, "%zd\n", size); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int old_order; + int order; + int pages; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + val *= 1024; /* value passed in is in KB */ + + pages = DIV_ROUND_UP(val, PAGE_SIZE); + order = fls(pages - 1); + + /* limit between 1 and 128 system pages */ + if (order < 0 || order > 7) + return -EINVAL; + + /* Do not allow tracing while changing the order of the ring buffer */ + tracing_stop_tr(tr); + + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + if (old_order == order) + goto out; + + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); + if (ret) + goto out; + +#ifdef CONFIG_TRACER_MAX_TRACE + + if (!tr->allocated_snapshot) + goto out_max; + + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret) { + /* Put back the old order */ + cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); + if (WARN_ON_ONCE(cnt)) { + /* + * AARGH! We are left with different orders! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the order of the main buffer, but failed to + * update the order of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + tracing_disabled = 1; + } + goto out; + } + out_max: +#endif + (*ppos)++; + out: + if (ret) + cnt = ret; + tracing_start_tr(tr); + return cnt; +} + +static const struct file_operations buffer_subbuf_size_fops = { + .open = tracing_open_generic_tr, + .read = buffer_subbuf_size_read, + .write = buffer_subbuf_size_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + +static struct dentry *trace_instance_dir; static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); -static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) { - int cpu; + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; - for_each_tracing_cpu(cpu) { - memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; - per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 1 : -1; +} + +static void setup_trace_scratch(struct trace_array *tr, + struct trace_scratch *tscratch, unsigned int size) +{ + struct trace_module_delta *module_delta; + struct trace_mod_entry *entry; + int i, nr_entries; + + if (!tscratch) + return; + + tr->scratch = tscratch; + tr->scratch_size = size; + + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; + + if (struct_size(tscratch, entries, tscratch->nr_entries) > size) + goto reset; + + /* Check if each module name is a valid string */ + for (i = 0; i < tscratch->nr_entries; i++) { + int n; + + entry = &tscratch->entries[i]; + + for (n = 0; n < MODULE_NAME_LEN; n++) { + if (entry->mod_name[n] == '\0') + break; + if (!isprint(entry->mod_name[n])) + goto reset; + } + if (n == MODULE_NAME_LEN) + goto reset; + } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. */ + module_for_each_mod(make_mod_delta, tr); + + /* Set trace_clock as the same of the previous boot. */ + if (tscratch->clock_id != tr->clock_id) { + if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || + tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { + pr_info("the previous trace_clock info is not valid."); + goto reset; + } } + return; + reset: + /* Invalid trace modules */ + memset(tscratch, 0, size); } static int -allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; + struct trace_scratch *tscratch; + unsigned int scratch_size = 0; + + rb_flags = tr->trace_flags & TRACE_ITER(OVERWRITE) ? RB_FL_OVERWRITE : 0; + + buf->tr = tr; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + if (tr->range_addr_start && tr->range_addr_size) { + /* Add scratch buffer to handle 128 modules */ + buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, + tr->range_addr_start, + tr->range_addr_size, + struct_size(tscratch, entries, 128)); - buf->buffer = ring_buffer_alloc(size, rb_flags); + tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); + setup_trace_scratch(tr, tscratch, scratch_size); + + /* + * This is basically the same as a mapped buffer, + * with the same restrictions. + */ + tr->mapped++; + } else { + buf->buffer = ring_buffer_alloc(size, rb_flags); + } if (!buf->buffer) return -ENOMEM; buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } - init_trace_buffers(tr, buf); - /* Allocate the first page for all buffers */ - set_buffer_entries(&tr->trace_buffer, - ring_buffer_size(tr->trace_buffer.buffer, 0)); + set_buffer_entries(&tr->array_buffer, + ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } +static void free_trace_buffer(struct array_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + static int allocate_trace_buffers(struct trace_array *tr, int size) { int ret; - ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + ret = allocate_trace_buffer(tr, &tr->array_buffer, size); if (ret) return ret; #ifdef CONFIG_TRACER_MAX_TRACE + /* Fix mapped buffer trace arrays do not have snapshot buffers */ + if (tr->range_addr_start) + return 0; + ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); - if (WARN_ON(ret)) { - ring_buffer_free(tr->trace_buffer.buffer); - free_percpu(tr->trace_buffer.data); + if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { + free_trace_buffer(&tr->array_buffer); return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; - /* - * Only the top level trace array gets its snapshot allocated - * from the kernel command line. - */ allocate_snapshot = false; #endif + return 0; } -static int new_instance_create(const char *name) +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); + +#ifdef CONFIG_TRACER_MAX_TRACE + free_trace_buffer(&tr->max_buffer); +#endif +} + +static void init_trace_flags_index(struct trace_array *tr) +{ + int i; + + /* Used by the trace options files */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) + tr->trace_flags_index[i] = i; +} + +static int __update_tracer(struct trace_array *tr) +{ + struct tracer *t; + int ret = 0; + + for (t = trace_types; t && !ret; t = t->next) + ret = add_tracer(tr, t); + + return ret; +} + +static __init int __update_tracer_options(struct trace_array *tr) +{ + struct tracers *t; + int ret = 0; + + list_for_each_entry(t, &tr->tracers, list) { + ret = add_tracer_options(tr, t); + if (ret < 0) + break; + } + + return ret; +} + +static __init void update_tracer_options(void) { struct trace_array *tr; - int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + tracer_options_updated = true; + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __update_tracer_options(tr); +} + +/* Must have trace_types_lock held */ +struct trace_array *trace_array_find(const char *instance) +{ + struct trace_array *tr, *found = NULL; - ret = -EEXIST; list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; + if (tr->name && strcmp(tr->name, instance) == 0) { + found = tr; + break; + } } + return found; +} + +struct trace_array *trace_array_find_get(const char *instance) +{ + struct trace_array *tr; + + guard(mutex)(&trace_types_lock); + tr = trace_array_find(instance); + if (tr) + tr->ref++; + + return tr; +} + +static int trace_array_create_dir(struct trace_array *tr) +{ + int ret; + + tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); + if (!tr->dir) + return -EINVAL; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) { + tracefs_remove(tr->dir); + return ret; + } + + init_tracer_tracefs(tr, tr->dir); + ret = __update_tracer(tr); + if (ret) { + event_trace_del_tracer(tr); + tracefs_remove(tr->dir); + return ret; + } + return 0; +} + +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems, + unsigned long range_addr_start, + unsigned long range_addr_size) +{ + struct trace_array *tr; + int ret; + ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) goto out_free_tr; + if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) + goto out_free_tr; + + if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + goto out_free_tr; + + if (systems) { + tr->system_names = kstrdup_const(systems, GFP_KERNEL); + if (!tr->system_names) + goto out_free_tr; + } + + /* Only for boot up memory mapped ring buffers */ + tr->range_addr_start = range_addr_start; + tr->range_addr_size = range_addr_size; + + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; + + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + raw_spin_lock_init(&tr->start_lock); + tr->syscall_buf_sz = global_trace.syscall_buf_sz; + + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&tr->snapshot_trigger_lock); +#endif tr->current_trace = &nop_trace; + tr->current_trace_flags = nop_trace.flags; INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); + INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->tracers); + INIT_LIST_HEAD(&tr->marker_list); + +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&tr->mod_events); +#endif if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - /* Holder for file callbacks */ - tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; - tr->trace_cpu.tr = tr; + /* The ring buffer is defaultly expanded */ + trace_set_ring_buffer_expanded(tr); - tr->dir = debugfs_create_dir(name, trace_instance_dir); - if (!tr->dir) + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; - ret = event_trace_add_tracer(tr->dir, tr); - if (ret) - goto out_free_tr; + ftrace_init_trace_array(tr); + + init_trace_flags_index(tr); - init_tracer_debugfs(tr, tr->dir); + if (trace_instance_dir) { + ret = trace_array_create_dir(tr); + if (ret) + goto out_free_tr; + } else + __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); + tr->ref++; - return 0; + return tr; out_free_tr: - if (tr->trace_buffer.buffer) - ring_buffer_free(tr->trace_buffer.buffer); + ftrace_free_ftrace_ops(tr); + free_trace_buffers(tr); + free_cpumask_var(tr->pipe_cpumask); + free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); + return ERR_PTR(ret); +} + +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL, 0, 0); +} + +static int instance_mkdir(const char *name) +{ + struct trace_array *tr; + int ret; + + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); + + ret = -EEXIST; + if (trace_array_find(name)) + return -EEXIST; + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); return ret; +} + +#ifdef CONFIG_MMU +static u64 map_pages(unsigned long start, unsigned long size) +{ + unsigned long vmap_start, vmap_end; + struct vm_struct *area; + int ret; + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return 0; + + vmap_start = (unsigned long) area->addr; + vmap_end = vmap_start + size; + + ret = vmap_page_range(vmap_start, vmap_end, + start, pgprot_nx(PAGE_KERNEL)); + if (ret < 0) { + free_vm_area(area); + return 0; + } + + return (u64)vmap_start; } +#else +static inline u64 map_pages(unsigned long start, unsigned long size) +{ + return 0; +} +#endif -static int instance_delete(const char *name) +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * @systems: A list of systems to create event directories for (NULL for all) + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * If the trace_array is to be freed, trace_array_destroy() needs to + * be called after the trace_array_put(), or simply let user space delete + * it from the tracefs instances directory. But until the + * trace_array_put() is called, user space can not delete it. + * + */ +struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; - int found = 0; - int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, name) == 0) { - found = 1; - break; + tr->ref++; + return tr; } } - if (!found) - goto out_unlock; - ret = -EBUSY; - if (tr->ref) - goto out_unlock; + tr = trace_array_create_systems(name, systems, 0, 0); + + if (IS_ERR(tr)) + tr = NULL; + else + tr->ref++; + + return tr; +} +EXPORT_SYMBOL_GPL(trace_array_get_by_name); + +static int __remove_instance(struct trace_array *tr) +{ + int i; + + /* Reference counter for a newly created trace array = 1. */ + if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) + return -EBUSY; list_del(&tr->list); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1 << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1ULL << i, 0); + } + + if (printk_trace == tr) + update_printk_trace(&global_trace); + + if (update_marker_trace(tr, 0)) + synchronize_rcu(); + + tracing_set_nop(tr); + clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); - debugfs_remove_recursive(tr->dir); - free_percpu(tr->trace_buffer.data); - ring_buffer_free(tr->trace_buffer.buffer); + ftrace_clear_pids(tr); + ftrace_destroy_function_files(tr); + tracefs_remove(tr->dir); + free_percpu(tr->last_func_repeats); + free_trace_buffers(tr); + clear_tracing_err_log(tr); + free_tracers(tr); + + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + vfree((void *)tr->range_addr_start); + + for (i = 0; i < tr->nr_topts; i++) { + kfree(tr->topts[i].topts); + } + kfree(tr->topts); + free_cpumask_var(tr->pipe_cpumask); + free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); - ret = 0; - - out_unlock: - mutex_unlock(&trace_types_lock); - - return ret; + return 0; } -static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +int trace_array_destroy(struct trace_array *this_tr) { - struct dentry *parent; - int ret; + struct trace_array *tr; - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; + if (!this_tr) + return -EINVAL; - /* - * The inode mutex is locked, but debugfs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the new_instance_create() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = new_instance_create(dentry->d_iname); - mutex_lock(&inode->i_mutex); + /* Making sure trace array exists before destroying it. */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) + return __remove_instance(tr); + } - return ret; + return -ENODEV; } +EXPORT_SYMBOL_GPL(trace_array_destroy); -static int instance_rmdir(struct inode *inode, struct dentry *dentry) +static int instance_rmdir(const char *name) { - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* The caller did a dget() on dentry */ - mutex_unlock(&dentry->d_inode->i_mutex); - - /* - * The inode mutex is locked, but debugfs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the instance_delete() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); + struct trace_array *tr; - ret = instance_delete(dentry->d_iname); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + tr = trace_array_find(name); + if (!tr) + return -ENODEV; - return ret; + return __remove_instance(tr); } -static const struct inode_operations instance_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = instance_mkdir, - .rmdir = instance_rmdir, -}; - static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = debugfs_create_dir("instances", d_tracer); - if (WARN_ON(!trace_instance_dir)) + struct trace_array *tr; + + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); + if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; - /* Hijack the dir inode operations, to allow mkdir */ - trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->name) + continue; + if (MEM_FAIL(trace_array_create_dir(tr) < 0, + "Failed to create instance directory\n")) + return; + } } static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; - trace_create_file("trace_options", 0644, d_tracer, + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, + tr, &set_tracer_fops); + + trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_cpumask_fops); + + trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); - trace_create_file("trace", 0644, d_tracer, - (void *)&tr->trace_cpu, &tracing_fops); + trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_fops); - trace_create_file("trace_pipe", 0444, d_tracer, - (void *)&tr->trace_cpu, &tracing_pipe_fops); + trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, + tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", 0644, d_tracer, - (void *)&tr->trace_cpu, &tracing_entries_fops); + trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_entries_fops); - trace_create_file("buffer_total_size_kb", 0444, d_tracer, + trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, @@ -6137,104 +10641,379 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); - trace_create_file("trace_clock", 0644, d_tracer, tr, + tr->trace_marker_file = __find_event_file(tr, "ftrace", "print"); + + trace_create_file("trace_marker_raw", 0220, d_tracer, + tr, &tracing_mark_raw_fops); + + trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops); - trace_create_file("tracing_on", 0644, d_tracer, - tr, &rb_simple_fops); + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, + tr, &rb_simple_fops); + + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, + &trace_time_stamp_mode_fops); + + tr->buffer_percent = 50; + + trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_percent_fops); + + trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_subbuf_size_fops); + + trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_syscall_buf_fops); + + create_trace_options_dir(tr); + +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_maxlat_file(tr, d_tracer); +#endif + + if (ftrace_create_function_files(tr, d_tracer)) + MEM_FAIL(1, "Could not allocate function filter files"); + if (tr->range_addr_start) { + trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, + tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, - (void *)&tr->trace_cpu, &snapshot_fops); + } else { + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, + tr, &snapshot_fops); #endif + } + + trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(tr, cpu); + tracing_init_tracefs_percpu(tr, cpu); + ftrace_init_tracefs(tr, d_tracer); } -static __init int tracer_init_debugfs(void) +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { - struct dentry *d_tracer; + struct vfsmount *mnt; + struct file_system_type *type; + struct fs_context *fc; + int ret; - trace_access_lock_init(); + /* + * To maintain backward compatibility for tools that mount + * debugfs to get to the tracing facility, tracefs is automatically + * mounted to the debugfs/tracing directory. + */ + type = get_fs_type("tracefs"); + if (!type) + return NULL; - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return 0; + fc = fs_context_for_submount(type, mntpt); + put_filesystem(type); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); + + ret = vfs_parse_fs_string(fc, "source", "tracefs"); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); - init_tracer_debugfs(&global_trace, d_tracer); + put_fs_context(fc); + return mnt; +} +#endif - trace_create_file("tracing_cpumask", 0644, d_tracer, - &global_trace, &tracing_cpumask_fops); +/** + * tracing_init_dentry - initialize top level trace array + * + * This is called when creating files or directories in the tracing + * directory. It is called via fs_initcall() by any of the boot up code + * and expects to return the dentry of the top level tracing directory. + */ +int tracing_init_dentry(void) +{ + struct trace_array *tr = &global_trace; - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Tracing disabled due to lockdown\n"); + return -EPERM; + } - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); + /* The top level trace array uses NULL as parent */ + if (tr->dir) + return 0; -#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); + if (WARN_ON(!tracefs_initialized())) + return -ENODEV; + +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED + /* + * As there may still be users that expect the tracing + * files to exist in debugfs/tracing, we must automount + * the tracefs file system there, so older tools still + * work with the newer kernel. + */ + tr->dir = debugfs_create_automount("tracing", NULL, + trace_automount, NULL); #endif - trace_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); + return 0; +} - trace_create_file("README", 0444, d_tracer, - NULL, &tracing_readme_fops); +extern struct trace_eval_map *__start_ftrace_eval_maps[]; +extern struct trace_eval_map *__stop_ftrace_eval_maps[]; - trace_create_file("saved_cmdlines", 0444, d_tracer, - NULL, &tracing_saved_cmdlines_fops); +static struct workqueue_struct *eval_map_wq __initdata; +static struct work_struct eval_map_work __initdata; +static struct work_struct tracerfs_init_work __initdata; -#ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); -#endif +static void __init eval_map_work_func(struct work_struct *work) +{ + int len; + + len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; + trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); +} + +static int __init trace_eval_init(void) +{ + INIT_WORK(&eval_map_work, eval_map_work_func); + + eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); + if (!eval_map_wq) { + pr_err("Unable to allocate eval_map_wq\n"); + /* Do work here */ + eval_map_work_func(&eval_map_work); + return -ENOMEM; + } - create_trace_instances(d_tracer); + queue_work(eval_map_wq, &eval_map_work); + return 0; +} - create_trace_options_dir(&global_trace); +subsys_initcall(trace_eval_init); +static int __init trace_eval_sync(void) +{ + /* Make sure the eval map updates are finished */ + if (eval_map_wq) + destroy_workqueue(eval_map_wq); return 0; } -static int trace_panic_handler(struct notifier_block *this, - unsigned long event, void *unused) +late_initcall_sync(trace_eval_sync); + + +#ifdef CONFIG_MODULES + +bool module_exists(const char *module) { - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); - return NOTIFY_OK; + /* All modules have the symbol __this_module */ + static const char this_mod[] = "__this_module"; + char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2]; + unsigned long val; + int n; + + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + + if (n > sizeof(modname) - 1) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; } -static struct notifier_block trace_panic_notifier = { - .notifier_call = trace_panic_handler, - .next = NULL, - .priority = 150 /* priority: INT_MAX >= x >= 0 */ -}; +static void trace_module_add_evals(struct module *mod) +{ + /* + * Modules with bad taint do not have events created, do + * not bother with enums either. + */ + if (trace_module_has_bad_taint(mod)) + return; + + /* Even if no trace_evals, this need to sanitize field types. */ + trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); +} + +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +static void trace_module_remove_evals(struct module *mod) +{ + union trace_eval_map_item *map; + union trace_eval_map_item **last = &trace_eval_maps; + + if (!mod->num_trace_evals) + return; + + guard(mutex)(&trace_eval_mutex); + + map = trace_eval_maps; + + while (map) { + if (map->head.mod == mod) + break; + map = trace_eval_jmp_to_tail(map); + last = &map->tail.next; + map = map->tail.next; + } + if (!map) + return; + + *last = trace_eval_jmp_to_tail(map)->tail.next; + kfree(map); +} +#else +static inline void trace_module_remove_evals(struct module *mod) { } +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ -static int trace_die_handler(struct notifier_block *self, - unsigned long val, - void *data) +static void trace_module_record(struct module *mod, bool add) { + struct trace_array *tr; + unsigned long flags; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); + /* Update any persistent trace array that has already been started */ + if (flags == TRACE_ARRAY_FL_BOOT && add) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); + } + } +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + switch (val) { - case DIE_OOPS: - if (ftrace_dump_on_oops) - ftrace_dump(ftrace_dump_on_oops); + case MODULE_STATE_COMING: + trace_module_add_evals(mod); + trace_module_record(mod, true); break; - default: + case MODULE_STATE_GOING: + trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } + return NOTIFY_OK; } +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; +#endif /* CONFIG_MODULES */ + +static __init void tracer_init_tracefs_work_func(struct work_struct *work) +{ + + event_trace_init(); + + init_tracer_tracefs(&global_trace, NULL); + ftrace_init_tracefs_toplevel(&global_trace, NULL); + + trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, + &global_trace, &tracing_thresh_fops); + + trace_create_file("README", TRACE_MODE_READ, NULL, + NULL, &tracing_readme_fops); + + trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, + NULL, &tracing_saved_cmdlines_fops); + + trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, + NULL, &tracing_saved_cmdlines_size_fops); + + trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, + NULL, &tracing_saved_tgids_fops); + + trace_create_eval_file(NULL); + +#ifdef CONFIG_MODULES + register_module_notifier(&trace_module_nb); +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, + NULL, &tracing_dyn_info_fops); +#endif + + create_trace_instances(NULL); + + update_tracer_options(); +} + +static __init int tracer_init_tracefs(void) +{ + int ret; + + trace_access_lock_init(); + + ret = tracing_init_dentry(); + if (ret) + return 0; + + if (eval_map_wq) { + INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); + queue_work(eval_map_wq, &tracerfs_init_work); + } else { + tracer_init_tracefs_work_func(NULL); + } + + if (rv_init_interface()) + pr_err("RV: Error while creating the RV interface\n"); + + return 0; +} + +fs_initcall(tracer_init_tracefs); + +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused); + +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, +}; + static struct notifier_block trace_die_notifier = { - .notifier_call = trace_die_handler, - .priority = 200 + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, }; /* + * The idea is to execute the following die/panic callback early, in order + * to avoid showing irrelevant information in the trace (like other panic + * notifier functions); we are the 2nd to run, after hung_task/rcu_stall + * warnings get disabled (to prevent potential log flooding). + */ +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused) +{ + if (!ftrace_dump_on_oops_enabled()) + return NOTIFY_DONE; + + /* The die notifier requires DIE_OOPS to trigger */ + if (self == &trace_die_notifier && ev != DIE_OOPS) + return NOTIFY_DONE; + + ftrace_dump(DUMP_PARAM); + + return NOTIFY_DONE; +} + +/* * printk is set to max of 1024, we really don't need it that big. * Nothing should be printing 1000 characters anyway. */ @@ -6251,39 +11030,62 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= TRACE_MAX_PRINT) - s->len = TRACE_MAX_PRINT; + if (s->seq.len >= TRACE_MAX_PRINT) + s->seq.len = TRACE_MAX_PRINT; + + /* + * More paranoid code. Although the buffer size is set to + * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just + * an extra layer of protection. + */ + if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) + s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. */ - s->buffer[s->len] = 0; + s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); trace_seq_init(s); } -void trace_init_global_iter(struct trace_iterator *iter) +static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { - iter->tr = &global_trace; + iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; - iter->trace_buffer = &global_trace.trace_buffer; + iter->array_buffer = &tr->array_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->array_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* Can not use kmalloc for iter.temp and iter.fmt */ + iter->temp = static_temp_buf; + iter->temp_size = STATIC_TEMP_BUF_SIZE; + iter->fmt = static_fmt_buf; + iter->fmt_size = STATIC_FMT_BUF_SIZE; } -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +void trace_init_global_iter(struct trace_iterator *iter) +{ + trace_init_iter(iter, &global_trace); +} + +static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static atomic_t dump_running; unsigned int old_userobj; unsigned long flags; - int cnt = 0, cpu; - - /* Only allow one dump user at a time. */ - if (atomic_inc_return(&dump_running) != 1) { - atomic_dec(&dump_running); - return; - } + int cnt = 0; /* * Always turn off tracing when we dump. @@ -6293,37 +11095,30 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. */ - tracing_off(); + tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ - trace_init_global_iter(&iter); + trace_init_iter(&iter, tr); - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); - } + /* While dumping, do not allow the buffer to be enable */ + tracer_tracing_disable(tr); - old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + old_userobj = tr->trace_flags & TRACE_ITER(SYM_USEROBJ); /* don't look at user memory in panic mode */ - trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + tr->trace_flags &= ~TRACE_ITER(SYM_USEROBJ); - switch (oops_dump_mode) { - case DUMP_ALL: - iter.cpu_file = RING_BUFFER_ALL_CPUS; - break; - case DUMP_ORIG: + if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); - break; - case DUMP_NONE: - goto out_enable; - default: - printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + else iter.cpu_file = RING_BUFFER_ALL_CPUS; - } - printk(KERN_TRACE "Dumping ftrace buffer:\n"); + if (tr == &global_trace) + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + else + printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { @@ -6332,7 +11127,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } /* - * We need to stop all tracing on all CPUS to read the + * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. We fill all what we can read, * and then release the locks again. @@ -6345,12 +11140,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; @@ -6358,10 +11149,10 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); + + trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); - - trace_printk_seq(&iter.seq); } if (!cnt) @@ -6369,56 +11160,452 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) else printk(KERN_TRACE "---------------------------------\n"); - out_enable: - trace_flags |= old_userobj; + tr->trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - atomic_dec(&dump_running); + tracer_tracing_enable(tr); local_irq_restore(flags); } + +static void ftrace_dump_by_param(void) +{ + bool first_param = true; + char dump_param[MAX_TRACER_SIZE]; + char *buf, *token, *inst_name; + struct trace_array *tr; + + strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); + buf = dump_param; + + while ((token = strsep(&buf, ",")) != NULL) { + if (first_param) { + first_param = false; + if (!strcmp("0", token)) + continue; + else if (!strcmp("1", token)) { + ftrace_dump_one(&global_trace, DUMP_ALL); + continue; + } + else if (!strcmp("2", token) || + !strcmp("orig_cpu", token)) { + ftrace_dump_one(&global_trace, DUMP_ORIG); + continue; + } + } + + inst_name = strsep(&token, "="); + tr = trace_array_find(inst_name); + if (!tr) { + printk(KERN_TRACE "Instance %s not found\n", inst_name); + continue; + } + + if (token && (!strcmp("2", token) || + !strcmp("orig_cpu", token))) + ftrace_dump_one(tr, DUMP_ORIG); + else + ftrace_dump_one(tr, DUMP_ALL); + } +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + static atomic_t dump_running; + + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + switch (oops_dump_mode) { + case DUMP_ALL: + ftrace_dump_one(&global_trace, DUMP_ALL); + break; + case DUMP_ORIG: + ftrace_dump_one(&global_trace, DUMP_ORIG); + break; + case DUMP_PARAM: + ftrace_dump_by_param(); + break; + case DUMP_NONE: + break; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + ftrace_dump_one(&global_trace, DUMP_ALL); + } + + atomic_dec(&dump_running); +} EXPORT_SYMBOL_GPL(ftrace_dump); +#define WRITE_BUFSIZE 4096 + +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(const char *)) +{ + char *kbuf __free(kfree) = NULL; + char *buf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) + return -EFAULT; + + kbuf[size] = '\0'; + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + return -EINVAL; + } + } + done += size; + + /* Remove comments */ + tmp = strchr(buf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = createfn(buf); + if (ret) + return ret; + buf += size; + + } while (done < count); + } + return done; +} + +#ifdef CONFIG_TRACER_MAX_TRACE +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init static void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will also clear + * this flag. + */ + allocate_snapshot = true; +} +#else +static inline void do_allocate_snapshot(const char *name) { } +#endif + +__init static int backup_instance_area(const char *backup, + unsigned long *addr, phys_addr_t *size) +{ + struct trace_array *backup_tr; + void *allocated_vaddr = NULL; + + backup_tr = trace_array_get_by_name(backup, NULL); + if (!backup_tr) { + pr_warn("Tracing: Instance %s is not found.\n", backup); + return -ENOENT; + } + + if (!(backup_tr->flags & TRACE_ARRAY_FL_BOOT)) { + pr_warn("Tracing: Instance %s is not boot mapped.\n", backup); + trace_array_put(backup_tr); + return -EINVAL; + } + + *size = backup_tr->range_addr_size; + + allocated_vaddr = vzalloc(*size); + if (!allocated_vaddr) { + pr_warn("Tracing: Failed to allocate memory for copying instance %s (size 0x%lx)\n", + backup, (unsigned long)*size); + trace_array_put(backup_tr); + return -ENOMEM; + } + + memcpy(allocated_vaddr, + (void *)backup_tr->range_addr_start, (size_t)*size); + *addr = (unsigned long)allocated_vaddr; + + trace_array_put(backup_tr); + return 0; +} + +__init static void enable_instances(void) +{ + struct trace_array *tr; + bool memmap_area = false; + char *curr_str; + char *name; + char *str; + char *tok; + + /* A tab is always appended */ + boot_instance_info[boot_instance_index - 1] = '\0'; + str = boot_instance_info; + + while ((curr_str = strsep(&str, "\t"))) { + phys_addr_t start = 0; + phys_addr_t size = 0; + unsigned long addr = 0; + bool traceprintk = false; + bool traceoff = false; + char *flag_delim; + char *addr_delim; + char *rname __free(kfree) = NULL; + char *backup; + + tok = strsep(&curr_str, ","); + + name = strsep(&tok, "="); + backup = tok; + + flag_delim = strchr(name, '^'); + addr_delim = strchr(name, '@'); + + if (addr_delim) + *addr_delim++ = '\0'; + + if (flag_delim) + *flag_delim++ = '\0'; + + if (backup) { + if (backup_instance_area(backup, &addr, &size) < 0) + continue; + } + + if (flag_delim) { + char *flag; + + while ((flag = strsep(&flag_delim, "^"))) { + if (strcmp(flag, "traceoff") == 0) { + traceoff = true; + } else if ((strcmp(flag, "printk") == 0) || + (strcmp(flag, "traceprintk") == 0) || + (strcmp(flag, "trace_printk") == 0)) { + traceprintk = true; + } else { + pr_info("Tracing: Invalid instance flag '%s' for %s\n", + flag, name); + } + } + } + + tok = addr_delim; + if (tok && isdigit(*tok)) { + start = memparse(tok, &tok); + if (!start) { + pr_warn("Tracing: Invalid boot instance address for %s\n", + name); + continue; + } + if (*tok != ':') { + pr_warn("Tracing: No size specified for instance %s\n", name); + continue; + } + tok++; + size = memparse(tok, &tok); + if (!size) { + pr_warn("Tracing: Invalid boot instance size for %s\n", + name); + continue; + } + memmap_area = true; + } else if (tok) { + if (!reserve_mem_find_by_name(tok, &start, &size)) { + start = 0; + pr_warn("Failed to map boot instance %s to %s\n", name, tok); + continue; + } + rname = kstrdup(tok, GFP_KERNEL); + } + + if (start) { + /* Start and size must be page aligned */ + if (start & ~PAGE_MASK) { + pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); + continue; + } + if (size & ~PAGE_MASK) { + pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); + continue; + } + + if (memmap_area) + addr = map_pages(start, size); + else + addr = (unsigned long)phys_to_virt(start); + if (addr) { + pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", + name, &start, (unsigned long)size); + } else { + pr_warn("Tracing: Failed to map boot instance %s\n", name); + continue; + } + } else { + /* Only non mapped buffers have snapshot buffers */ + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(name); + } + + tr = trace_array_create_systems(name, NULL, addr, size); + if (IS_ERR(tr)) { + pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str); + continue; + } + + if (traceoff) + tracer_tracing_off(tr); + + if (traceprintk) + update_printk_trace(tr); + + /* + * memmap'd buffers can not be freed. + */ + if (memmap_area) { + tr->flags |= TRACE_ARRAY_FL_MEMMAP; + tr->ref++; + } + + /* + * Backup buffers can be freed but need vfree(). + */ + if (backup) + tr->flags |= TRACE_ARRAY_FL_VMALLOC; + + if (start || backup) { + tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; + tr->range_name = no_free_ptr(rname); + } + + while ((tok = strsep(&curr_str, ","))) { + early_enable_events(tr, tok, true); + } + } +} + __init static int tracer_alloc_buffers(void) { int ring_buf_size; int ret = -ENOMEM; + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Tracing disabled due to lockdown\n"); + return -EPERM; + } + + /* + * Make sure we don't accidentally add more trace options + * than we have bits for. + */ + BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; - if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ - if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) /* Must be called before global_trace.buffer is allocated */ trace_printk_init_buffers(); /* To save memory, keep the ring buffer size to its minimum */ - if (ring_buffer_expanded) + if (global_trace.ring_buffer_expanded) ring_buf_size = trace_buf_size; else ring_buf_size = 1; cpumask_copy(tracing_buffer_mask, cpu_possible_mask); - cpumask_copy(tracing_cpumask, cpu_all_mask); + cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. + */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:prepare", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; + /* Used for event triggers */ + ret = -ENOMEM; + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_rm_hp_state; + + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + + if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + goto out_free_savedcmd; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { - printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); - WARN_ON(1); - goto out_free_cpumask; + MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); + goto out_free_pipe_cpumask; } - if (global_trace.buffer_disabled) tracing_off(); - trace_init_cmdlines(); + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } /* * register_tracer() might reference current_trace, so it @@ -6426,8 +11613,21 @@ __init static int tracer_alloc_buffers(void) * just a bootstrap of current_trace anyway. */ global_trace.current_trace = &nop_trace; + global_trace.current_trace_flags = nop_trace.flags; - register_tracer(&nop_trace); + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&global_trace.snapshot_trigger_lock); +#endif + ftrace_init_global_array_ops(&global_trace); + +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&global_trace.mod_events); +#endif + + init_trace_flags_index(&global_trace); + + INIT_LIST_HEAD(&global_trace.tracers); /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -6439,38 +11639,92 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; - /* Holder for file callbacks */ - global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; - global_trace.trace_cpu.tr = &global_trace; + global_trace.syscall_buf_sz = syscall_buf_size; INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); + INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); - while (trace_boot_options) { - char *option; + register_tracer(&nop_trace); - option = strsep(&trace_boot_options, ","); - trace_set_options(&global_trace, option); - } + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + + apply_trace_boot_options(); register_snapshot_cmd(); return 0; +out_free_pipe_cpumask: + free_cpumask_var(global_trace.pipe_cpumask); +out_free_savedcmd: + trace_free_saved_cmdlines_buffer(); +out_free_temp_buffer: + ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: - free_percpu(global_trace.trace_buffer.data); -#ifdef CONFIG_TRACER_MAX_TRACE - free_percpu(global_trace.max_buffer.data); -#endif - free_cpumask_var(tracing_cpumask); + free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } -__init static int clear_boot_tracer(void) +#ifdef CONFIG_FUNCTION_TRACER +/* Used to set module cached ftrace filtering at boot up */ +struct trace_array *trace_get_global_array(void) +{ + return &global_trace; +} +#endif + +void __init ftrace_boot_snapshot(void) +{ +#ifdef CONFIG_TRACER_MAX_TRACE + struct trace_array *tr; + + if (!snapshot_at_boot) + return; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->allocated_snapshot) + continue; + + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); + } +#endif +} + +void __init early_trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (MEM_FAIL(!tracepoint_print_iter, + "Failed to allocate trace iterator\n")) + tracepoint_printk = 0; + else + static_key_enable(&tracepoint_printk_key.key); + } + tracer_alloc_buffers(); + + init_events(); +} + +void __init trace_init(void) +{ + trace_event_init(); + + if (boot_instance_index) + enable_instances(); +} + +__init static void clear_boot_tracer(void) { /* * The default tracer at boot buffer is an init section. @@ -6480,15 +11734,48 @@ __init static int clear_boot_tracer(void) * about to be freed. */ if (!default_bootup_tracer) - return 0; + return; printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", default_bootup_tracer); default_bootup_tracer = NULL; +} + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__init static void tracing_set_default_clock(void) +{ + /* sched_clock_stable() is determined in late_initcall */ + if (!trace_boot_clock && !sched_clock_stable()) { + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not set tracing clock due to lockdown\n"); + return; + } + + printk(KERN_WARNING + "Unstable clock detected, switching default tracing clock to \"global\"\n" + "If you want to keep using the local clock, then add:\n" + " \"trace_clock=local\"\n" + "on the kernel command line\n"); + tracing_set_clock(&global_trace, "global"); + } +} +#else +static inline void tracing_set_default_clock(void) { } +#endif + +__init static int late_trace_init(void) +{ + if (tracepoint_printk && tracepoint_printk_stop_on_boot) { + static_key_disable(&tracepoint_printk_key.key); + tracepoint_printk = 0; + } + + if (traceoff_after_boot) + tracing_off(); + tracing_set_default_clock(); + clear_boot_tracer(); return 0; } -early_initcall(tracer_alloc_buffers); -fs_initcall(tracer_init_debugfs); -late_initcall(clear_boot_tracer); +late_initcall_sync(late_trace_init); |
