From 4ccf11c4e8a8e051499d53a12f502196c97a758e Mon Sep 17 00:00:00 2001 From: Tze-nan Wu Date: Tue, 21 Mar 2023 19:04:43 +0800 Subject: tracing/synthetic: Fix races on freeing last_cmd Currently, the "last_cmd" variable can be accessed by multiple processes asynchronously when multiple users manipulate synthetic_events node at the same time, it could lead to use-after-free or double-free. This patch add "lastcmd_mutex" to prevent "last_cmd" from being accessed asynchronously. ================================================================ It's easy to reproduce in the KASAN environment by running the two scripts below in different shells. script 1: while : do echo -n -e '\x88' > /sys/kernel/tracing/synthetic_events done script 2: while : do echo -n -e '\xb0' > /sys/kernel/tracing/synthetic_events done ================================================================ double-free scenario: process A process B ------------------- --------------- 1.kstrdup last_cmd 2.free last_cmd 3.free last_cmd(double-free) ================================================================ use-after-free scenario: process A process B ------------------- --------------- 1.kstrdup last_cmd 2.free last_cmd 3.tracing_log_err(use-after-free) ================================================================ Appendix 1. KASAN report double-free: BUG: KASAN: double-free in kfree+0xdc/0x1d4 Free of addr ***** by task sh/4879 Call trace: ... kfree+0xdc/0x1d4 create_or_delete_synth_event+0x60/0x1e8 trace_parse_run_command+0x2bc/0x4b8 synth_events_write+0x20/0x30 vfs_write+0x200/0x830 ... Allocated by task 4879: ... kstrdup+0x5c/0x98 create_or_delete_synth_event+0x6c/0x1e8 trace_parse_run_command+0x2bc/0x4b8 synth_events_write+0x20/0x30 vfs_write+0x200/0x830 ... Freed by task 5464: ... kfree+0xdc/0x1d4 create_or_delete_synth_event+0x60/0x1e8 trace_parse_run_command+0x2bc/0x4b8 synth_events_write+0x20/0x30 vfs_write+0x200/0x830 ... ================================================================ Appendix 2. KASAN report use-after-free: BUG: KASAN: use-after-free in strlen+0x5c/0x7c Read of size 1 at addr ***** by task sh/5483 sh: CPU: 7 PID: 5483 Comm: sh ... __asan_report_load1_noabort+0x34/0x44 strlen+0x5c/0x7c tracing_log_err+0x60/0x444 create_or_delete_synth_event+0xc4/0x204 trace_parse_run_command+0x2bc/0x4b8 synth_events_write+0x20/0x30 vfs_write+0x200/0x830 ... Allocated by task 5483: ... kstrdup+0x5c/0x98 create_or_delete_synth_event+0x80/0x204 trace_parse_run_command+0x2bc/0x4b8 synth_events_write+0x20/0x30 vfs_write+0x200/0x830 ... Freed by task 5480: ... kfree+0xdc/0x1d4 create_or_delete_synth_event+0x74/0x204 trace_parse_run_command+0x2bc/0x4b8 synth_events_write+0x20/0x30 vfs_write+0x200/0x830 ... Link: https://lore.kernel.org/linux-trace-kernel/20230321110444.1587-1-Tze-nan.Wu@mediatek.com Fixes: 27c888da9867 ("tracing: Remove size restriction on synthetic event cmd error logging") Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Matthias Brugger Cc: AngeloGioacchino Del Regno Cc: "Tom Zanussi" Signed-off-by: Tze-nan Wu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_synth.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 46d0abb32d0f..f0ff730125bf 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -44,14 +44,21 @@ enum { ERRORS }; static const char *err_text[] = { ERRORS }; +DEFINE_MUTEX(lastcmd_mutex); static char *last_cmd; static int errpos(const char *str) { + int ret = 0; + + mutex_lock(&lastcmd_mutex); if (!str || !last_cmd) - return 0; + goto out; - return err_pos(last_cmd, str); + ret = err_pos(last_cmd, str); + out: + mutex_unlock(&lastcmd_mutex); + return ret; } static void last_cmd_set(const char *str) @@ -59,18 +66,22 @@ static void last_cmd_set(const char *str) if (!str) return; + mutex_lock(&lastcmd_mutex); kfree(last_cmd); - last_cmd = kstrdup(str, GFP_KERNEL); + mutex_unlock(&lastcmd_mutex); } static void synth_err(u8 err_type, u16 err_pos) { + mutex_lock(&lastcmd_mutex); if (!last_cmd) - return; + goto out; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); + out: + mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); -- cgit From 6455b6163d8c680366663cdb8c679514d55fc30c Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Sat, 25 Mar 2023 10:12:47 +0800 Subject: ring-buffer: Fix race while reader and writer are on the same page When user reads file 'trace_pipe', kernel keeps printing following logs that warn at "cpu_buffer->reader_page->read > rb_page_size(reader)" in rb_get_reader_page(). It just looks like there's an infinite loop in tracing_read_pipe(). This problem occurs several times on arm64 platform when testing v5.10 and below. Call trace: rb_get_reader_page+0x248/0x1300 rb_buffer_peek+0x34/0x160 ring_buffer_peek+0xbc/0x224 peek_next_entry+0x98/0xbc __find_next_entry+0xc4/0x1c0 trace_find_next_entry_inc+0x30/0x94 tracing_read_pipe+0x198/0x304 vfs_read+0xb4/0x1e0 ksys_read+0x74/0x100 __arm64_sys_read+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180 Then I dump the vmcore and look into the problematic per_cpu ring_buffer, I found that tail_page/commit_page/reader_page are on the same page while reader_page->read is obviously abnormal: tail_page == commit_page == reader_page == { .write = 0x100d20, .read = 0x8f9f4805, // Far greater than 0xd20, obviously abnormal!!! .entries = 0x10004c, .real_end = 0x0, .page = { .time_stamp = 0x857257416af0, .commit = 0xd20, // This page hasn't been full filled. // .data[0...0xd20] seems normal. } } The root cause is most likely the race that reader and writer are on the same page while reader saw an event that not fully committed by writer. To fix this, add memory barriers to make sure the reader can see the content of what is committed. Since commit a0fcaaed0c46 ("ring-buffer: Fix race between reset page and reading page") has added the read barrier in rb_get_reader_page(), here we just need to add the write barrier. Link: https://lore.kernel.org/linux-trace-kernel/20230325021247.2923907-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 77ae365eca89 ("ring-buffer: make lockless") Suggested-by: Steven Rostedt (Google) Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c6f47b6cfd5f..76a2d91eecad 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3098,6 +3098,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_is_reader_page(cpu_buffer->tail_page))) return; + /* + * No need for a memory barrier here, as the update + * of the tail_page did it for this page. + */ local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); @@ -3107,6 +3111,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) while (rb_commit_index(cpu_buffer) != rb_page_write(cpu_buffer->commit_page)) { + /* Make sure the readers see the content of what is committed. */ + smp_wmb(); local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); RB_WARN_ON(cpu_buffer, @@ -4684,7 +4690,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * Make sure we see any padding after the write update - * (see rb_reset_tail()) + * (see rb_reset_tail()). + * + * In addition, a writer may be writing on the reader page + * if the page has not been fully filled, so the read barrier + * is also needed to make sure we see the content of what is + * committed by the writer (see rb_set_commit_to_write()). */ smp_rmb(); -- cgit From b9f451a9029a16eb7913ace09b92493d00f2e564 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 29 Mar 2023 17:50:15 +0200 Subject: tracing/timerlat: Notify new max thread latency timerlat is not reporting a new tracing_max_latency for the thread latency. The reason is that it is not calling notify_new_max_latency() function after the new thread latency is sampled. Call notify_new_max_latency() after computing the thread latency. Link: https://lkml.kernel.org/r/16e18d61d69073d0192ace07bf61e405cca96e9c.1680104184.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: dae181349f1e ("tracing/osnoise: Support a list of trace_array *tr") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 9176bb7a9bb4..e8116094bed8 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1738,6 +1738,8 @@ static int timerlat_main(void *data) trace_timerlat_sample(&s); + notify_new_max_latency(diff); + timerlat_dump_stack(time_to_us(diff)); tlat->tracing_thread = false; -- cgit From d3cba7f02cd82118c32651c73374d8a5a459d9a6 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 29 Mar 2023 17:50:16 +0200 Subject: tracing/osnoise: Fix notify new tracing_max_latency osnoise/timerlat tracers are reporting new max latency on instances where the tracing is off, creating inconsistencies between the max reported values in the trace and in the tracing_max_latency. Thus only report new tracing_max_latency on active tracing instances. Link: https://lkml.kernel.org/r/ecd109fde4a0c24ab0f00ba1e9a144ac19a91322.1680104184.git.bristot@kernel.org Cc: stable@vger.kernel.org Fixes: dae181349f1e ("tracing/osnoise: Support a list of trace_array *tr") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e8116094bed8..4496975f2029 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1296,7 +1296,7 @@ static void notify_new_max_latency(u64 latency) rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { tr = inst->tr; - if (tr->max_latency < latency) { + if (tracer_tracing_is_on(tr) && tr->max_latency < latency) { tr->max_latency = latency; latency_fsnotify(tr); } -- cgit From 9d52727f8043cfda241ae96896628d92fa9c50bb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 4 Apr 2023 22:21:14 -0400 Subject: tracing: Have tracing_snapshot_instance_cond() write errors to the appropriate instance If a trace instance has a failure with its snapshot code, the error message is to be written to that instance's buffer. But currently, the message is written to the top level buffer. Worse yet, it may also disable the top level buffer and not the instance that had the issue. Link: https://lkml.kernel.org/r/20230405022341.688730321@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ross Zwisler Fixes: 2824f50332486 ("tracing: Make the snapshot trigger work with instances") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 937e9676dfd4..ed1d1093f5e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1149,22 +1149,22 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, unsigned long flags; if (in_nmi()) { - internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); - internal_trace_puts("*** snapshot is being ignored ***\n"); + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); return; } if (!tr->allocated_snapshot) { - internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); - internal_trace_puts("*** stopping trace here! ***\n"); - tracing_off(); + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! ***\n"); + tracer_tracing_off(tr); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { - internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); - internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } -- cgit From e94891641c21f607e4d6887bcd3beff882fcc483 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 4 Apr 2023 22:21:15 -0400 Subject: tracing: Fix ftrace_boot_snapshot command line logic The kernel command line ftrace_boot_snapshot by itself is supposed to trigger a snapshot at the end of boot up of the main top level trace buffer. A ftrace_boot_snapshot=foo will do the same for an instance called foo that was created by trace_instance=foo,... The logic was broken where if ftrace_boot_snapshot was by itself, it would trigger a snapshot for all instances that had tracing enabled, regardless if it asked for a snapshot or not. When a snapshot is requested for a buffer, the buffer's tr->allocated_snapshot is set to true. Use that to know if a trace buffer wants a snapshot at boot up or not. Since the top level buffer is part of the ftrace_trace_arrays list, there's no reason to treat it differently than the other buffers. Just iterate the list if ftrace_boot_snapshot was specified. Link: https://lkml.kernel.org/r/20230405022341.895334039@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ross Zwisler Fixes: 9c1c251d670bc ("tracing: Allow boot instances to have snapshot buffers") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed1d1093f5e9..4686473b8497 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10393,19 +10393,20 @@ out: void __init ftrace_boot_snapshot(void) { +#ifdef CONFIG_TRACER_MAX_TRACE struct trace_array *tr; - if (snapshot_at_boot) { - tracing_snapshot(); - internal_trace_puts("** Boot snapshot taken **\n"); - } + if (!snapshot_at_boot) + return; list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr == &global_trace) + if (!tr->allocated_snapshot) continue; - trace_array_puts(tr, "** Boot snapshot taken **\n"); + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); } +#endif } void __init early_trace_init(void) -- cgit From 3357c6e429643231e60447b52ffbb7ac895aca22 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 4 Apr 2023 19:45:04 -0400 Subject: tracing: Free error logs of tracing instances When a tracing instance is removed, the error messages that hold errors that occurred in the instance needs to be freed. The following reports a memory leak: # cd /sys/kernel/tracing # mkdir instances/foo # echo 'hist:keys=x' > instances/foo/events/sched/sched_switch/trigger # cat instances/foo/error_log [ 117.404795] hist:sched:sched_switch: error: Couldn't find field Command: hist:keys=x ^ # rmdir instances/foo Then check for memory leaks: # echo scan > /sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff88810d8ec700 (size 192): comm "bash", pid 869, jiffies 4294950577 (age 215.752s) hex dump (first 32 bytes): 60 dd 68 61 81 88 ff ff 60 dd 68 61 81 88 ff ff `.ha....`.ha.... a0 30 8c 83 ff ff ff ff 26 00 0a 00 00 00 00 00 .0......&....... backtrace: [<00000000dae26536>] kmalloc_trace+0x2a/0xa0 [<00000000b2938940>] tracing_log_err+0x277/0x2e0 [<000000004a0e1b07>] parse_atom+0x966/0xb40 [<0000000023b24337>] parse_expr+0x5f3/0xdb0 [<00000000594ad074>] event_hist_trigger_parse+0x27f8/0x3560 [<00000000293a9645>] trigger_process_regex+0x135/0x1a0 [<000000005c22b4f2>] event_trigger_write+0x87/0xf0 [<000000002cadc509>] vfs_write+0x162/0x670 [<0000000059c3b9be>] ksys_write+0xca/0x170 [<00000000f1cddc00>] do_syscall_64+0x3e/0xc0 [<00000000868ac68c>] entry_SYSCALL_64_after_hwframe+0x72/0xdc unreferenced object 0xffff888170c35a00 (size 32): comm "bash", pid 869, jiffies 4294950577 (age 215.752s) hex dump (first 32 bytes): 0a 20 20 43 6f 6d 6d 61 6e 64 3a 20 68 69 73 74 . Command: hist 3a 6b 65 79 73 3d 78 0a 00 00 00 00 00 00 00 00 :keys=x......... backtrace: [<000000006a747de5>] __kmalloc+0x4d/0x160 [<000000000039df5f>] tracing_log_err+0x29b/0x2e0 [<000000004a0e1b07>] parse_atom+0x966/0xb40 [<0000000023b24337>] parse_expr+0x5f3/0xdb0 [<00000000594ad074>] event_hist_trigger_parse+0x27f8/0x3560 [<00000000293a9645>] trigger_process_regex+0x135/0x1a0 [<000000005c22b4f2>] event_trigger_write+0x87/0xf0 [<000000002cadc509>] vfs_write+0x162/0x670 [<0000000059c3b9be>] ksys_write+0xca/0x170 [<00000000f1cddc00>] do_syscall_64+0x3e/0xc0 [<00000000868ac68c>] entry_SYSCALL_64_after_hwframe+0x72/0xdc The problem is that the error log needs to be freed when the instance is removed. Link: https://lore.kernel.org/lkml/76134d9f-a5ba-6a0d-37b3-28310b4a1e91@alu.unizg.hr/ Link: https://lore.kernel.org/linux-trace-kernel/20230404194504.5790b95f@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Mark Rutland Cc: Thorsten Leemhuis Cc: Ulf Hansson Cc: Eric Biggers Fixes: 2f754e771b1a6 ("tracing: Have the error logs show up in the proper instances") Reported-by: Mirsad Goran Todorovac Tested-by: Mirsad Todorovac Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4686473b8497..36a6037823cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9516,6 +9516,7 @@ static int __remove_instance(struct trace_array *tr) tracefs_remove(tr->dir); free_percpu(tr->last_func_repeats); free_trace_buffers(tr); + clear_tracing_err_log(tr); for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); -- cgit