summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig55
-rw-r--r--kernel/trace/blktrace.c39
-rw-r--r--kernel/trace/bpf_trace.c594
-rw-r--r--kernel/trace/bpf_trace.h2
-rw-r--r--kernel/trace/fgraph.c1222
-rw-r--r--kernel/trace/fprobe.c662
-rw-r--r--kernel/trace/ftrace.c1238
-rw-r--r--kernel/trace/ftrace_internal.h18
-rw-r--r--kernel/trace/pid_list.c7
-rw-r--r--kernel/trace/preemptirq_delay_test.c3
-rw-r--r--kernel/trace/rethook.c4
-rw-r--r--kernel/trace/ring_buffer.c1788
-rw-r--r--kernel/trace/ring_buffer_benchmark.c4
-rw-r--r--kernel/trace/rv/Kconfig27
-rw-r--r--kernel/trace/rv/Makefile3
-rw-r--r--kernel/trace/rv/monitors/wip/Kconfig12
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c2
-rw-r--r--kernel/trace/rv/monitors/wip/wip_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wwnr/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr_trace.h16
-rw-r--r--kernel/trace/rv/rv.c9
-rw-r--r--kernel/trace/rv/rv_reactors.c1
-rw-r--r--kernel/trace/rv/rv_trace.h130
-rw-r--r--kernel/trace/trace.c1925
-rw-r--r--kernel/trace/trace.h197
-rw-r--r--kernel/trace/trace_benchmark.c5
-rw-r--r--kernel/trace/trace_branch.c10
-rw-r--r--kernel/trace/trace_clock.c2
-rw-r--r--kernel/trace/trace_dynevent.c23
-rw-r--r--kernel/trace/trace_entries.h37
-rw-r--r--kernel/trace/trace_eprobe.c46
-rw-r--r--kernel/trace/trace_event_perf.c6
-rw-r--r--kernel/trace/trace_events.c766
-rw-r--r--kernel/trace/trace_events_filter.c31
-rw-r--r--kernel/trace/trace_events_hist.c176
-rw-r--r--kernel/trace/trace_events_inject.c2
-rw-r--r--kernel/trace/trace_events_synth.c17
-rw-r--r--kernel/trace/trace_events_trigger.c134
-rw-r--r--kernel/trace/trace_events_user.c292
-rw-r--r--kernel/trace/trace_fprobe.c486
-rw-r--r--kernel/trace/trace_functions.c58
-rw-r--r--kernel/trace/trace_functions_graph.c437
-rw-r--r--kernel/trace/trace_hwlat.c6
-rw-r--r--kernel/trace/trace_irqsoff.c29
-rw-r--r--kernel/trace/trace_kdb.c13
-rw-r--r--kernel/trace/trace_kprobe.c417
-rw-r--r--kernel/trace/trace_mmiotrace.c8
-rw-r--r--kernel/trace/trace_osnoise.c123
-rw-r--r--kernel/trace/trace_output.c42
-rw-r--r--kernel/trace/trace_preemptirq.c37
-rw-r--r--kernel/trace/trace_probe.c493
-rw-r--r--kernel/trace/trace_probe.h37
-rw-r--r--kernel/trace/trace_probe_tmpl.h10
-rw-r--r--kernel/trace/trace_sched_switch.c515
-rw-r--r--kernel/trace/trace_sched_wakeup.c40
-rw-r--r--kernel/trace/trace_selftest.c280
-rw-r--r--kernel/trace/trace_stack.c8
-rw-r--r--kernel/trace/trace_stat.c26
-rw-r--r--kernel/trace/trace_syscalls.c40
-rw-r--r--kernel/trace/trace_uprobe.c225
-rw-r--r--kernel/trace/tracing_map.c12
62 files changed, 9201 insertions, 3674 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61c541c36596..d570b8b9c0a9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.rst
-config HAVE_FUNCTION_GRAPH_RETVAL
+config HAVE_FUNCTION_GRAPH_FREGS
bool
+config HAVE_FTRACE_GRAPH_FUNC
+ bool
+ help
+ True if ftrace_graph_func() is defined.
+
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -57,6 +62,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
This allows for use of ftrace_regs_get_argument() and
ftrace_regs_get_stack_pointer().
+config HAVE_FTRACE_REGS_HAVING_PT_REGS
+ bool
+ help
+ If this is set, ftrace_regs has pt_regs, thus it can convert to
+ pt_regs without allocating memory.
+
config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
bool
help
@@ -163,7 +174,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
config GENERIC_TRACER
bool
@@ -204,7 +215,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
@@ -232,7 +243,7 @@ config FUNCTION_GRAPH_TRACER
config FUNCTION_GRAPH_RETVAL
bool "Kernel Function Graph Return Value"
- depends on HAVE_FUNCTION_GRAPH_RETVAL
+ depends on HAVE_FUNCTION_GRAPH_FREGS
depends on FUNCTION_GRAPH_TRACER
default n
help
@@ -242,6 +253,16 @@ config FUNCTION_GRAPH_RETVAL
enable it via the trace option funcgraph-retval.
See Documentation/trace/ftrace.rst
+config FUNCTION_GRAPH_RETADDR
+ bool "Kernel Function Graph Return Address"
+ depends on FUNCTION_GRAPH_TRACER
+ default n
+ help
+ Support recording and printing the function return address when
+ using function graph tracer. It can be helpful to locate code line that
+ the function is called. This feature is off by default, and you can
+ enable it via the trace option funcgraph-retaddr.
+
config DYNAMIC_FTRACE
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
@@ -286,10 +307,9 @@ config DYNAMIC_FTRACE_WITH_ARGS
config FPROBE
bool "Kernel Function Probe (fprobe)"
- depends on FUNCTION_TRACER
- depends on DYNAMIC_FTRACE_WITH_REGS
- depends on HAVE_RETHOOK
- select RETHOOK
+ depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
+ depends on DYNAMIC_FTRACE_WITH_ARGS
+ select FUNCTION_GRAPH_TRACER
default n
help
This option enables kernel function probe (fprobe) based on ftrace.
@@ -965,7 +985,7 @@ config FTRACE_RECORD_RECURSION
config FTRACE_RECORD_RECURSION_SIZE
int "Max number of recursed functions to record"
- default 128
+ default 128
depends on FTRACE_RECORD_RECURSION
help
This defines the limit of number of functions that can be
@@ -974,6 +994,19 @@ config FTRACE_RECORD_RECURSION_SIZE
This file can be reset, but the limit can not change in
size at runtime.
+config FTRACE_VALIDATE_RCU_IS_WATCHING
+ bool "Validate RCU is on during ftrace execution"
+ depends on FUNCTION_TRACER
+ depends on ARCH_WANTS_NO_INSTR
+ help
+ All callbacks that attach to the function tracing have some sort of
+ protection against recursion. This option is only to verify that
+ ftrace (and other users of ftrace_test_recursion_trylock()) are not
+ called outside of RCU, as if they are, it can cause a race. But it
+ also has a noticeable overhead when enabled.
+
+ If unsure, say N
+
config RING_BUFFER_RECORD_RECURSION
bool "Record functions that recurse in the ring buffer"
depends on FTRACE_RECORD_RECURSION
@@ -1123,7 +1156,7 @@ config PREEMPTIRQ_DELAY_TEST
config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
- depends on SYNTH_EVENTS
+ depends on SYNTH_EVENTS && m
help
This option creates a test module to check the base
functionality of in-kernel synthetic event definition and
@@ -1136,7 +1169,7 @@ config SYNTH_EVENT_GEN_TEST
config KPROBE_EVENT_GEN_TEST
tristate "Test module for in-kernel kprobe event generation"
- depends on KPROBE_EVENTS
+ depends on KPROBE_EVENTS && m
help
This option creates a test module to check the base
functionality of in-kernel kprobe event definition.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d5d94510afd3..3679a6d18934 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
- buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+ strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
/*
* some device names have larger paths - convert the slashes
@@ -618,8 +617,9 @@ err:
return ret;
}
-static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev, char __user *arg)
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -628,29 +628,18 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (ret)
return -EFAULT;
+ mutex_lock(&q->debugfs_mutex);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ mutex_unlock(&q->debugfs_mutex);
if (ret)
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
-
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
-{
- int ret;
-
- mutex_lock(&q->debugfs_mutex);
- ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->debugfs_mutex);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -674,12 +663,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.pid = cbuts.pid,
};
+ mutex_lock(&q->debugfs_mutex);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ mutex_unlock(&q->debugfs_mutex);
if (ret)
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
@@ -733,12 +724,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
int ret, start = 0;
char b[BDEVNAME_SIZE];
- mutex_lock(&q->debugfs_mutex);
-
switch (cmd) {
case BLKTRACESETUP:
snprintf(b, sizeof(b), "%pg", bdev);
- ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -750,17 +739,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
start = 1;
fallthrough;
case BLKTRACESTOP:
- ret = __blk_trace_startstop(q, start);
+ ret = blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = __blk_trace_remove(q);
+ ret = blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
-
- mutex_unlock(&q->debugfs_mutex);
return ret;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7ac6c52b25eb..adc947587eb8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,7 +24,6 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
-#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -358,17 +357,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
-{
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
- current->comm, task_pid_nr(current));
-
- return &bpf_probe_write_user_proto;
-}
-
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
@@ -620,7 +608,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_sample_data *sd)
+ u64 flags, struct perf_raw_record *raw,
+ struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
@@ -645,6 +634,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
+ perf_sample_save_raw_data(sd, event, raw);
+
return perf_event_output(event, sd, regs);
}
@@ -688,9 +679,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- err = __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_trace_nest_level);
preempt_enable();
@@ -749,9 +739,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- ret = __bpf_perf_event_output(regs, map, flags, sd);
+ ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
preempt_enable();
@@ -798,34 +787,13 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct cgroup *cgrp;
-
- if (unlikely(idx >= array->map.max_entries))
- return -E2BIG;
-
- cgrp = READ_ONCE(array->ptrs[idx]);
- if (unlikely(!cgrp))
- return -EAGAIN;
-
- return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
- .func = bpf_current_task_under_cgroup,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
enum pid_type type;
+ bool has_siginfo;
+ struct kernel_siginfo info;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -833,30 +801,49 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
static void do_bpf_send_signal(struct irq_work *entry)
{
struct send_signal_irq_work *work;
+ struct kernel_siginfo *siginfo;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV;
+
+ group_send_sig_info(work->sig, siginfo, work->task, work->type);
put_task_struct(work->task);
}
-static int bpf_send_signal_common(u32 sig, enum pid_type type)
+static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value)
{
struct send_signal_irq_work *work = NULL;
+ struct kernel_siginfo info;
+ struct kernel_siginfo *siginfo;
+
+ if (!task) {
+ task = current;
+ siginfo = SEND_SIG_PRIV;
+ } else {
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ info.si_value.sival_ptr = (void *)(unsigned long)value;
+ siginfo = &info;
+ }
/* Similar to bpf_probe_write_user, task needs to be
* in a sound condition and kernel memory access be
* permitted in order to send signal to the current
* task.
*/
- if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
+ if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
/* Task should not be pid=1 to avoid kernel panic. */
- if (unlikely(is_global_init(current)))
+ if (unlikely(is_global_init(task)))
return -EPERM;
- if (irqs_disabled()) {
+ if (!preemptible()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
@@ -871,19 +858,22 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = get_task_struct(current);
+ work->task = get_task_struct(task);
+ work->has_siginfo = siginfo == &info;
+ if (work->has_siginfo)
+ copy_siginfo(&work->info, &info);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+ return group_send_sig_info(sig, siginfo, task, type);
}
BPF_CALL_1(bpf_send_signal, u32, sig)
{
- return bpf_send_signal_common(sig, PIDTYPE_TGID);
+ return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -895,7 +885,7 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
{
- return bpf_send_signal_common(sig, PIDTYPE_PID);
+ return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
@@ -1053,9 +1043,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip)
{
u32 instr;
- /* Being extra safe in here in case entry ip is on the page-edge. */
- if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
- return fentry_ip;
+ /* We want to be extra safe in case entry ip is on the page edge,
+ * but otherwise we need to avoid get_kernel_nofault()'s overhead.
+ */
+ if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
+ if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
+ return fentry_ip;
+ } else {
+ instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
+ }
if (is_endbr(instr))
fentry_ip -= ENDBR_INSN_SIZE;
return fentry_ip;
@@ -1182,9 +1178,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
u32 entry_cnt = size / br_entry_size;
@@ -1197,7 +1190,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return -ENOENT;
return entry_cnt * br_entry_size;
-#endif
}
static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
@@ -1224,7 +1216,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_LONG,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u64),
};
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
@@ -1240,7 +1233,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_LONG,
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg2_size = sizeof(u64),
};
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
@@ -1367,8 +1361,8 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
/**
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
- * @data_ptr: data to verify
- * @sig_ptr: signature of the data
+ * @data_p: data to verify
+ * @sig_p: signature of the data
* @trusted_keyring: keyring with keys trusted for signature verification
*
* Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
@@ -1376,10 +1370,12 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
- struct bpf_dynptr_kern *sig_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
struct bpf_key *trusted_keyring)
{
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
const void *data, *sig;
u32 data_len, sig_len;
int ret;
@@ -1412,14 +1408,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
__bpf_kfunc_end_defs();
-BTF_SET8_START(key_sig_kfunc_set)
+BTF_KFUNCS_START(key_sig_kfunc_set)
BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
#endif
-BTF_SET8_END(key_sig_kfunc_set)
+BTF_KFUNCS_END(key_sig_kfunc_set)
static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
.owner = THIS_MODULE,
@@ -1435,75 +1431,11 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
-/* filesystem kfuncs */
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_get_file_xattr - get xattr of a file
- * @file: file to get xattr from
- * @name__str: name of the xattr
- * @value_ptr: output buffer of the xattr value
- *
- * Get xattr *name__str* of *file* and store the output in *value_ptr*.
- *
- * For security reasons, only *name__str* with prefix "user." is allowed.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
- struct bpf_dynptr_kern *value_ptr)
-{
- struct dentry *dentry;
- u32 value_len;
- void *value;
- int ret;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
- value_len = __bpf_dynptr_size(value_ptr);
- value = __bpf_dynptr_data_rw(value_ptr, value_len);
- if (!value)
- return -EINVAL;
-
- dentry = file_dentry(file);
- ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
- if (ret)
- return ret;
- return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_SET8_START(fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_SET8_END(fs_kfunc_set_ids)
-
-static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
-{
- if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
- return 0;
-
- /* Only allow to attach from LSM hooks, to avoid recursion */
- return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
-}
-
-static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &fs_kfunc_set_ids,
- .filter = bpf_get_file_xattr_filter,
-};
-
-static int __init bpf_fs_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
-}
-
-late_initcall(bpf_fs_kfuncs_init);
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
return &bpf_map_lookup_elem_proto;
@@ -1525,8 +1457,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
@@ -1545,13 +1475,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_probe_write_user:
- return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
- NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
@@ -1575,6 +1500,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
@@ -1582,8 +1509,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_send_signal_thread_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_get_ns_current_pid_tgid:
- return &bpf_get_ns_current_pid_tgid_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1597,7 +1522,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
- return &bpf_get_task_stack_proto;
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
@@ -1629,8 +1555,45 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
default:
- return bpf_base_func_proto(func_id);
+ break;
}
+
+ func_proto = bpf_base_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : &bpf_probe_write_user_proto;
+ default:
+ return NULL;
+ }
+}
+
+static bool is_kprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_kprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_uprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_uprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
}
static const struct bpf_func_proto *
@@ -1642,21 +1605,21 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto;
+ return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_func_ip_proto_kprobe_multi;
- if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ if (is_uprobe_multi(prog))
return &bpf_get_func_ip_proto_uprobe_multi;
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_attach_cookie_proto_kmulti;
- if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ if (is_uprobe_multi(prog))
return &bpf_get_attach_cookie_proto_umulti;
return &bpf_get_attach_cookie_proto_trace;
default:
@@ -2008,6 +1971,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_raw_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_raw_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_tracing;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -2070,6 +2035,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
case BPF_FUNC_get_attach_cookie:
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_attach_cookie_proto_tracing;
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
@@ -2278,6 +2246,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
+ struct bpf_prog *prog = NULL;
int ret;
mutex_lock(&bpf_event_mutex);
@@ -2286,9 +2255,10 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
+ if (!old_array)
+ goto put;
+
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
- if (ret == -ENOENT)
- goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
@@ -2296,11 +2266,23 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_free_sleepable(old_array);
}
- bpf_prog_put(event->prog);
+put:
+ prog = event->prog;
event->prog = NULL;
unlock:
mutex_unlock(&bpf_event_mutex);
+
+ if (prog) {
+ /*
+ * It could be that the bpf_prog is not sleepable (and will be freed
+ * via normal RCU), but is called from a point that supports sleepable
+ * programs and uses tasks-trace-RCU.
+ */
+ synchronize_rcu_tasks_trace();
+
+ bpf_prog_put(prog);
+ }
}
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
@@ -2370,16 +2352,26 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
}
static __always_inline
-void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct bpf_prog *prog = link->link.prog;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_trace_run_ctx run_ctx;
+
cant_sleep();
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
+
+ run_ctx.bpf_cookie = link->cookie;
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
out:
this_cpu_dec(*(prog->active));
}
@@ -2408,12 +2400,12 @@ out:
#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
#define BPF_TRACE_DEFN_x(x) \
- void bpf_trace_run##x(struct bpf_prog *prog, \
+ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \
REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
{ \
u64 args[x]; \
REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
- __bpf_trace_run(prog, args); \
+ __bpf_trace_run(link, args); \
} \
EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
@@ -2429,9 +2421,10 @@ BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
-static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
struct tracepoint *tp = btp->tp;
+ struct bpf_prog *prog = link->link.prog;
/*
* check that program doesn't access arguments beyond what's
@@ -2443,18 +2436,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
- prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
}
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
- return __bpf_probe_register(btp, prog);
-}
-
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
-{
- return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -2577,6 +2564,12 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+struct bpf_session_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ bool is_return;
+ void *data;
+};
+
#ifdef CONFIG_FPROBE
struct bpf_kprobe_multi_link {
struct bpf_link link;
@@ -2590,7 +2583,7 @@ struct bpf_kprobe_multi_link {
};
struct bpf_kprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
struct bpf_kprobe_multi_link *link;
unsigned long entry_ip;
};
@@ -2600,6 +2593,20 @@ struct user_syms {
char *buf;
};
+#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS
+static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);
+#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs)
+#else
+#define bpf_kprobe_multi_pt_regs_ptr() (NULL)
+#endif
+
+static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip)
+{
+ unsigned long ip = ftrace_get_symaddr(fentry_ip);
+
+ return ip ? : fentry_ip;
+}
+
static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
{
unsigned long __user usymbol;
@@ -2679,6 +2686,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
+ u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies);
u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs);
struct bpf_kprobe_multi_link *kmulti_link;
u32 ucount = info->kprobe_multi.count;
@@ -2686,6 +2694,8 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
if (!uaddrs ^ !ucount)
return -EINVAL;
+ if (ucookies && !ucount)
+ return -EINVAL;
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
info->kprobe_multi.count = kmulti_link->cnt;
@@ -2699,6 +2709,18 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
else
ucount = kmulti_link->cnt;
+ if (ucookies) {
+ if (kmulti_link->cookies) {
+ if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64)))
+ return -EFAULT;
+ } else {
+ for (i = 0; i < ucount; i++) {
+ if (put_user(0, ucookies + i))
+ return -EFAULT;
+ }
+ }
+ }
+
if (kallsyms_show_value(current_cred())) {
if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64)))
return -EFAULT;
@@ -2713,7 +2735,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
- .dealloc = bpf_kprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_kprobe_multi_link_dealloc,
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
@@ -2754,7 +2776,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
if (WARN_ON_ONCE(!ctx))
return 0;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
link = run_ctx->link;
if (!link->cookies)
return 0;
@@ -2771,30 +2794,38 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_kprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
static int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
- unsigned long entry_ip, struct pt_regs *regs)
+ unsigned long entry_ip, struct ftrace_regs *fregs,
+ bool is_return, void *data)
{
struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.link = link,
.entry_ip = entry_ip,
};
struct bpf_run_ctx *old_run_ctx;
+ struct pt_regs *regs;
int err;
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
bpf_prog_inc_misses_counter(link->link.prog);
- err = 0;
+ err = 1;
goto out;
}
migrate_disable();
rcu_read_lock();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
@@ -2807,25 +2838,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
+ int err;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
- return 0;
+ err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, false, data);
+ return is_kprobe_session(link->link.prog) ? err : 0;
}
static void
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, true, data);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
@@ -2958,7 +2992,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ if (!is_kprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.kprobe_multi.flags;
@@ -3039,10 +3073,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error;
- if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_exit_handler;
- else
+ if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
link->fp.entry_handler = kprobe_multi_link_handler;
+ if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
+ if (is_kprobe_session(prog))
+ link->fp.entry_data_size = sizeof(u64);
link->addrs = addrs;
link->cookies = cookies;
@@ -3107,7 +3143,9 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
+ struct uprobe *uprobe;
struct uprobe_consumer consumer;
+ bool session;
};
struct bpf_uprobe_multi_link {
@@ -3120,20 +3158,20 @@ struct bpf_uprobe_multi_link {
};
struct bpf_uprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
unsigned long entry_ip;
struct bpf_uprobe *uprobe;
};
-static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
- u32 cnt)
+static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
- for (i = 0; i < cnt; i++) {
- uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
- &uprobes[i].consumer);
- }
+ for (i = 0; i < cnt; i++)
+ uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer);
+
+ if (cnt)
+ uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
@@ -3141,7 +3179,10 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
}
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
@@ -3149,9 +3190,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- if (umulti_link->task)
- put_task_struct(umulti_link->task);
- path_put(&umulti_link->path);
kvfree(umulti_link->uprobes);
kfree(umulti_link);
}
@@ -3167,7 +3205,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
struct bpf_uprobe_multi_link *umulti_link;
u32 ucount = info->uprobe_multi.count;
int err = 0, i;
- long left;
+ char *p, *buf;
+ long left = 0;
if (!upath ^ !upath_size)
return -EINVAL;
@@ -3181,26 +3220,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
info->uprobe_multi.pid = umulti_link->task ?
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
- if (upath) {
- char *p, *buf;
-
- upath_size = min_t(u32, upath_size, PATH_MAX);
-
- buf = kmalloc(upath_size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- p = d_path(&umulti_link->path, buf, upath_size);
- if (IS_ERR(p)) {
- kfree(buf);
- return PTR_ERR(p);
- }
- upath_size = buf + upath_size - p;
- left = copy_to_user(upath, p, upath_size);
+ upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX;
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
kfree(buf);
- if (left)
- return -EFAULT;
- info->uprobe_multi.path_size = upath_size;
+ return PTR_ERR(p);
}
+ upath_size = buf + upath_size - p;
+
+ if (upath)
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
if (!uoffsets && !ucookies && !uref_ctr_offsets)
return 0;
@@ -3227,25 +3263,30 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
- .dealloc = bpf_uprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_uprobe_multi_link_dealloc,
.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
unsigned long entry_ip,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ bool is_return, void *data)
{
struct bpf_uprobe_multi_link *link = uprobe->link;
struct bpf_uprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.entry_ip = entry_ip,
.uprobe = uprobe,
};
struct bpf_prog *prog = link->link.prog;
- bool sleepable = prog->aux->sleepable;
+ bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
- int err = 0;
+ int err;
- if (link->task && current != link->task)
+ if (link->task && !same_thread_group(current, link->task))
return 0;
if (sleepable)
@@ -3255,7 +3296,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
migrate_disable();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
@@ -3269,8 +3310,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
}
static bool
-uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
- struct mm_struct *mm)
+uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
{
struct bpf_uprobe *uprobe;
@@ -3279,28 +3319,36 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx
}
static int
-uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data)
{
struct bpf_uprobe *uprobe;
+ int ret;
uprobe = container_of(con, struct bpf_uprobe, consumer);
- return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+ ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data);
+ if (uprobe->session)
+ return ret ? UPROBE_HANDLER_IGNORE : 0;
+ return 0;
}
static int
-uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
+ __u64 *data)
{
struct bpf_uprobe *uprobe;
uprobe = container_of(con, struct bpf_uprobe, consumer);
- return uprobe_prog_run(uprobe, func, regs);
+ uprobe_prog_run(uprobe, func, regs, true, data);
+ return 0;
}
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
@@ -3308,7 +3356,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->uprobe->cookie;
}
@@ -3332,7 +3381,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+ if (!is_uprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
@@ -3346,8 +3395,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
cnt = attr->link_create.uprobe_multi.cnt;
+ pid = attr->link_create.uprobe_multi.pid;
- if (!upath || !uoffsets || !cnt)
+ if (!upath || !uoffsets || !cnt || pid < 0)
return -EINVAL;
if (cnt > MAX_UPROBE_MULTI_CNT)
return -E2BIG;
@@ -3371,11 +3421,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error_path_put;
}
- pid = attr->link_create.uprobe_multi.pid;
if (pid) {
- rcu_read_lock();
- task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
- rcu_read_unlock();
+ task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
if (!task) {
err = -ESRCH;
goto error_path_put;
@@ -3410,11 +3457,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uprobes[i].link = link;
- if (flags & BPF_F_UPROBE_MULTI_RETURN)
- uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
- else
+ if (!(flags & BPF_F_UPROBE_MULTI_RETURN))
uprobes[i].consumer.handler = uprobe_multi_link_handler;
-
+ if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog))
+ uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+ if (is_uprobe_session(prog))
+ uprobes[i].session = true;
if (pid)
uprobes[i].consumer.filter = uprobe_multi_link_filter;
}
@@ -3429,22 +3477,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
- err = uprobe_register_refctr(d_real_inode(link->path.dentry),
- uprobes[i].offset,
- uprobes[i].ref_ctr_offset,
- &uprobes[i].consumer);
- if (err) {
- bpf_uprobe_unregister(&path, uprobes, i);
- goto error_free;
+ uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ uprobes[i].ref_ctr_offset,
+ &uprobes[i].consumer);
+ if (IS_ERR(uprobes[i].uprobe)) {
+ err = PTR_ERR(uprobes[i].uprobe);
+ link->cnt = i;
+ goto error_unregister;
}
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
- goto error_free;
+ goto error_unregister;
return bpf_link_settle(&link_primer);
+error_unregister:
+ bpf_uprobe_unregister(uprobes, link->cnt);
+
error_free:
kvfree(uprobes);
kfree(link);
@@ -3468,3 +3520,65 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return 0;
}
#endif /* CONFIG_UPROBES */
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc bool bpf_session_is_return(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->is_return;
+}
+
+__bpf_kfunc __u64 *bpf_session_cookie(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->data;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_session_is_return)
+BTF_ID_FLAGS(func, bpf_session_cookie)
+BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+
+static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
+ return -EACCES;
+
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &kprobe_multi_kfunc_set_ids,
+ .filter = bpf_kprobe_multi_filter,
+};
+
+static int __init bpf_kprobe_multi_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+}
+
+late_initcall(bpf_kprobe_multi_kfuncs_init);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
+ u64 value)
+{
+ if (type != PIDTYPE_PID && type != PIDTYPE_TGID)
+ return -EINVAL;
+
+ return bpf_send_signal_common(sig, type, task, value);
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
index 9acbc11ac7bb..c4075b56becc 100644
--- a/kernel/trace/bpf_trace.h
+++ b/kernel/trace/bpf_trace.h
@@ -19,7 +19,7 @@ TRACE_EVENT(bpf_trace_printk,
),
TP_fast_assign(
- __assign_str(bpf_string, bpf_string);
+ __assign_str(bpf_string);
),
TP_printk("%s", __get_str(bpf_string))
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c83c005e654e..5dddfc2149f6 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,9 +7,11 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/bits.h>
#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
+#include <linux/static_call.h>
#include <linux/slab.h>
#include <trace/events/sched.h>
@@ -17,19 +19,487 @@
#include "ftrace_internal.h"
#include "trace.h"
-#ifdef CONFIG_DYNAMIC_FTRACE
-#define ASSIGN_OPS_HASH(opsname, val) \
- .func_hash = val, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
-#else
-#define ASSIGN_OPS_HASH(opsname, val)
-#endif
+/*
+ * FGRAPH_FRAME_SIZE: Size in bytes of the meta data on the shadow stack
+ * FGRAPH_FRAME_OFFSET: Size in long words of the meta data frame
+ */
+#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack)
+#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
+
+/*
+ * On entry to a function (via function_graph_enter()), a new fgraph frame
+ * (ftrace_ret_stack) is pushed onto the stack as well as a word that
+ * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
+ *
+ * bits: 0 - 9 offset in words from the previous ftrace_ret_stack
+ *
+ * bits: 10 - 11 Type of storage
+ * 0 - reserved
+ * 1 - bitmap of fgraph_array index
+ * 2 - reserved data
+ *
+ * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
+ * bits: 12 - 27 The bitmap of fgraph_ops fgraph_array index
+ * That is, it's a bitmask of 0-15 (16 bits)
+ * where if a corresponding ops in the fgraph_array[]
+ * expects a callback from the return of the function
+ * it's corresponding bit will be set.
+ *
+ *
+ * The top of the ret_stack (when not empty) will always have a reference
+ * word that points to the last fgraph frame that was saved.
+ *
+ * For reserved data:
+ * bits: 12 - 17 The size in words that is stored
+ * bits: 18 - 23 The index of fgraph_array, which shows who is stored
+ *
+ * That is, at the end of function_graph_enter, if the first and forth
+ * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
+ * on the return of the function being traced, and the forth fgraph_ops
+ * stored two words of data, this is what will be on the task's shadow
+ * ret_stack: (the stack grows upward)
+ *
+ * ret_stack[SHADOW_STACK_OFFSET]
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[15] |
+ * ...
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[0] |
+ * ret_stack[SHADOW_STACK_MAX_OFFSET]
+ * ...
+ * | | <- task->curr_ret_stack
+ * +--------------------------------------------+
+ * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \ | This is for fgraph_ops[3].
+ * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \ | The data size is 2 words.
+ * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ |
+ * | (offset2:FGRAPH_FRAME_OFFSET+3) | <- the offset2 is from here
+ * +--------------------------------------------+ ( It is 4 words from the ret_stack)
+ * | STORED DATA WORD 2 |
+ * | STORED DATA WORD 1 |
+ * +--------------------------------------------+
+ * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \ |
+ * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ |
+ * | (offset1:FGRAPH_FRAME_OFFSET) | <- the offset1 is from here
+ * +--------------------------------------------+
+ * | struct ftrace_ret_stack |
+ * | (stores the saved ret pointer) | <- the offset points here
+ * +--------------------------------------------+
+ * | (X) | (N) | ( N words away from
+ * | | previous ret_stack)
+ * ...
+ * ret_stack[0]
+ *
+ * If a backtrace is required, and the real return pointer needs to be
+ * fetched, then it looks at the task's curr_ret_stack offset, if it
+ * is greater than zero (reserved, or right before popped), it would mask
+ * the value by FGRAPH_FRAME_OFFSET_MASK to get the offset of the
+ * ftrace_ret_stack structure stored on the shadow stack.
+ */
+
+/*
+ * The following is for the top word on the stack:
+ *
+ * FGRAPH_FRAME_OFFSET (0-9) holds the offset delta to the fgraph frame
+ * FGRAPH_TYPE (10-11) holds the type of word this is.
+ * (RESERVED or BITMAP)
+ */
+#define FGRAPH_FRAME_OFFSET_BITS 10
+#define FGRAPH_FRAME_OFFSET_MASK GENMASK(FGRAPH_FRAME_OFFSET_BITS - 1, 0)
+
+#define FGRAPH_TYPE_BITS 2
+#define FGRAPH_TYPE_MASK GENMASK(FGRAPH_TYPE_BITS - 1, 0)
+#define FGRAPH_TYPE_SHIFT FGRAPH_FRAME_OFFSET_BITS
+
+enum {
+ FGRAPH_TYPE_RESERVED = 0,
+ FGRAPH_TYPE_BITMAP = 1,
+ FGRAPH_TYPE_DATA = 2,
+};
+
+/*
+ * For BITMAP type:
+ * FGRAPH_INDEX (12-27) bits holding the gops index wanting return callback called
+ */
+#define FGRAPH_INDEX_BITS 16
+#define FGRAPH_INDEX_MASK GENMASK(FGRAPH_INDEX_BITS - 1, 0)
+#define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+
+/*
+ * For DATA type:
+ * FGRAPH_DATA (12-17) bits hold the size of data (in words)
+ * FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is for
+ *
+ * Note:
+ * data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words.
+ */
+#define FGRAPH_DATA_BITS 5
+#define FGRAPH_DATA_MASK GENMASK(FGRAPH_DATA_BITS - 1, 0)
+#define FGRAPH_DATA_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+#define FGRAPH_MAX_DATA_SIZE (sizeof(long) * (1 << FGRAPH_DATA_BITS))
+
+#define FGRAPH_DATA_INDEX_BITS 4
+#define FGRAPH_DATA_INDEX_MASK GENMASK(FGRAPH_DATA_INDEX_BITS - 1, 0)
+#define FGRAPH_DATA_INDEX_SHIFT (FGRAPH_DATA_SHIFT + FGRAPH_DATA_BITS)
+
+#define FGRAPH_MAX_INDEX \
+ ((FGRAPH_INDEX_SIZE << FGRAPH_DATA_BITS) + FGRAPH_RET_INDEX)
+
+#define FGRAPH_ARRAY_SIZE FGRAPH_INDEX_BITS
+
+/*
+ * SHADOW_STACK_SIZE: The size in bytes of the entire shadow stack
+ * SHADOW_STACK_OFFSET: The size in long words of the shadow stack
+ * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added
+ */
+#define SHADOW_STACK_SIZE (4096)
+#define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long))
+/* Leave on a buffer at the end */
+#define SHADOW_STACK_MAX_OFFSET \
+ (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE))
+
+/* RET_STACK(): Return the frame from a given @offset from task @t */
+#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
+
+/*
+ * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * ret_stack to store task specific state.
+ */
+#define SHADOW_STACK_TASK_VARS(ret_stack) \
+ ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - FGRAPH_ARRAY_SIZE]))
DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
+static struct kmem_cache *fgraph_stack_cachep;
+
+static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+static unsigned long fgraph_array_bitmask;
+
+/* LRU index table for fgraph_array */
+static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
+static int fgraph_lru_next;
+static int fgraph_lru_last;
+
+/* Initialize fgraph_lru_table with unused index */
+static void fgraph_lru_init(void)
+{
+ int i;
+
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_lru_table[i] = i;
+}
+
+/* Release the used index to the LRU table */
+static int fgraph_lru_release_index(int idx)
+{
+ if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE ||
+ WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1))
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_last] = idx;
+ fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+
+ clear_bit(idx, &fgraph_array_bitmask);
+ return 0;
+}
+
+/* Allocate a new index from LRU table */
+static int fgraph_lru_alloc_index(void)
+{
+ int idx = fgraph_lru_table[fgraph_lru_next];
+
+ /* No id is available */
+ if (idx == -1)
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_next] = -1;
+ fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+
+ set_bit(idx, &fgraph_array_bitmask);
+ return idx;
+}
+
+/* Get the offset to the fgraph frame from a ret_stack value */
+static inline int __get_offset(unsigned long val)
+{
+ return val & FGRAPH_FRAME_OFFSET_MASK;
+}
+
+/* Get the type of word from a ret_stack value */
+static inline int __get_type(unsigned long val)
+{
+ return (val >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
+}
+
+/* Get the data_index for a DATA type ret_stack word */
+static inline int __get_data_index(unsigned long val)
+{
+ return (val >> FGRAPH_DATA_INDEX_SHIFT) & FGRAPH_DATA_INDEX_MASK;
+}
+
+/* Get the data_size for a DATA type ret_stack word */
+static inline int __get_data_size(unsigned long val)
+{
+ return ((val >> FGRAPH_DATA_SHIFT) & FGRAPH_DATA_MASK) + 1;
+}
+
+/* Get the word from the ret_stack at @offset */
+static inline unsigned long get_fgraph_entry(struct task_struct *t, int offset)
+{
+ return t->ret_stack[offset];
+}
+
+/* Get the FRAME_OFFSET from the word from the @offset on ret_stack */
+static inline int get_frame_offset(struct task_struct *t, int offset)
+{
+ return __get_offset(t->ret_stack[offset]);
+}
+
+/* For BITMAP type: get the bitmask from the @offset at ret_stack */
+static inline unsigned long
+get_bitmap_bits(struct task_struct *t, int offset)
+{
+ return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
+}
+
+/* Write the bitmap to the ret_stack at @offset (does index, offset and bitmask) */
+static inline void
+set_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
+{
+ t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
+ (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+}
+
+/* For DATA type: get the data saved under the ret_stack word at @offset */
+static inline void *get_data_type_data(struct task_struct *t, int offset)
+{
+ unsigned long val = t->ret_stack[offset];
+
+ if (__get_type(val) != FGRAPH_TYPE_DATA)
+ return NULL;
+ offset -= __get_data_size(val);
+ return (void *)&t->ret_stack[offset];
+}
+
+/* Create the ret_stack word for a DATA type */
+static inline unsigned long make_data_type_val(int idx, int size, int offset)
+{
+ return (idx << FGRAPH_DATA_INDEX_SHIFT) |
+ ((size - 1) << FGRAPH_DATA_SHIFT) |
+ (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT) | offset;
+}
+
+/* ftrace_graph_entry set to this to tell some archs to run function graph */
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
+{
+ return 0;
+}
+
+/* ftrace_graph_return set to this to tell some archs to run function graph */
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
+{
+}
+
+static void ret_stack_set_task_var(struct task_struct *t, int idx, long val)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ gvals[idx] = val;
+}
+
+static unsigned long *
+ret_stack_get_task_var(struct task_struct *t, int idx)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ return &gvals[idx];
+}
+
+static void ret_stack_init_task_vars(unsigned long *ret_stack)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack);
+
+ memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE);
+}
+
+/**
+ * fgraph_reserve_data - Reserve storage on the task's ret_stack
+ * @idx: The index of fgraph_array
+ * @size_bytes: The size in bytes to reserve
+ *
+ * Reserves space of up to FGRAPH_MAX_DATA_SIZE bytes on the
+ * task's ret_stack shadow stack, for a given fgraph_ops during
+ * the entryfunc() call. If entryfunc() returns zero, the storage
+ * is discarded. An entryfunc() can only call this once per iteration.
+ * The fgraph_ops retfunc() can retrieve this stored data with
+ * fgraph_retrieve_data().
+ *
+ * Returns: On success, a pointer to the data on the stack.
+ * Otherwise, NULL if there's not enough space left on the
+ * ret_stack for the data, or if fgraph_reserve_data() was called
+ * more than once for a single entryfunc() call.
+ */
+void *fgraph_reserve_data(int idx, int size_bytes)
+{
+ unsigned long val;
+ void *data;
+ int curr_ret_stack = current->curr_ret_stack;
+ int data_size;
+
+ if (size_bytes > FGRAPH_MAX_DATA_SIZE)
+ return NULL;
+
+ /* Convert the data size to number of longs. */
+ data_size = (size_bytes + sizeof(long) - 1) >> (sizeof(long) == 4 ? 2 : 3);
+
+ val = get_fgraph_entry(current, curr_ret_stack - 1);
+ data = &current->ret_stack[curr_ret_stack];
+
+ curr_ret_stack += data_size + 1;
+ if (unlikely(curr_ret_stack >= SHADOW_STACK_MAX_OFFSET))
+ return NULL;
+
+ val = make_data_type_val(idx, data_size, __get_offset(val) + data_size + 1);
+
+ /* Set the last word to be reserved */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ /* Make sure interrupts see this */
+ barrier();
+ current->curr_ret_stack = curr_ret_stack;
+ /* Again sync with interrupts, and reset reserve */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ return data;
+}
+
+/**
+ * fgraph_retrieve_data - Retrieve stored data from fgraph_reserve_data()
+ * @idx: the index of fgraph_array (fgraph_ops::idx)
+ * @size_bytes: pointer to retrieved data size.
+ *
+ * This is to be called by a fgraph_ops retfunc(), to retrieve data that
+ * was stored by the fgraph_ops entryfunc() on the function entry.
+ * That is, this will retrieve the data that was reserved on the
+ * entry of the function that corresponds to the exit of the function
+ * that the fgraph_ops retfunc() is called on.
+ *
+ * Returns: The stored data from fgraph_reserve_data() called by the
+ * matching entryfunc() for the retfunc() this is called from.
+ * Or NULL if there was nothing stored.
+ */
+void *fgraph_retrieve_data(int idx, int *size_bytes)
+{
+ return fgraph_retrieve_parent_data(idx, size_bytes, 0);
+}
+
+/**
+ * fgraph_get_task_var - retrieve a task specific state variable
+ * @gops: The ftrace_ops that owns the task specific variable
+ *
+ * Every registered fgraph_ops has a task state variable
+ * reserved on the task's ret_stack. This function returns the
+ * address to that variable.
+ *
+ * Returns the address to the fgraph_ops @gops tasks specific
+ * unsigned long variable.
+ */
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops)
+{
+ return ret_stack_get_task_var(current, gops->idx);
+}
+
+/*
+ * @offset: The offset into @t->ret_stack to find the ret_stack entry
+ * @frame_offset: Where to place the offset into @t->ret_stack of that entry
+ *
+ * Returns a pointer to the previous ret_stack below @offset or NULL
+ * when it reaches the bottom of the stack.
+ *
+ * Calling this with:
+ *
+ * offset = task->curr_ret_stack;
+ * do {
+ * ret_stack = get_ret_stack(task, offset, &offset);
+ * } while (ret_stack);
+ *
+ * Will iterate through all the ret_stack entries from curr_ret_stack
+ * down to the first one.
+ */
+static inline struct ftrace_ret_stack *
+get_ret_stack(struct task_struct *t, int offset, int *frame_offset)
+{
+ int offs;
+
+ BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long));
+
+ if (unlikely(offset <= 0))
+ return NULL;
+
+ offs = get_frame_offset(t, --offset);
+ if (WARN_ON_ONCE(offs <= 0 || offs > offset))
+ return NULL;
+
+ offset -= offs;
+
+ *frame_offset = offset;
+ return RET_STACK(t, offset);
+}
+
+/**
+ * fgraph_retrieve_parent_data - get data from a parent function
+ * @idx: The index into the fgraph_array (fgraph_ops::idx)
+ * @size_bytes: A pointer to retrieved data size
+ * @depth: The depth to find the parent (0 is the current function)
+ *
+ * This is similar to fgraph_retrieve_data() but can be used to retrieve
+ * data from a parent caller function.
+ *
+ * Return: a pointer to the specified parent data or NULL if not found
+ */
+void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth)
+{
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = current->curr_ret_stack;
+ unsigned long val;
+
+ if (offset <= 0)
+ return NULL;
+
+ for (;;) {
+ int next_offset;
+
+ ret_stack = get_ret_stack(current, offset, &next_offset);
+ if (!ret_stack || --depth < 0)
+ break;
+ offset = next_offset;
+ }
+
+ if (!ret_stack)
+ return NULL;
+
+ offset--;
+
+ val = get_fgraph_entry(current, offset);
+ while (__get_type(val) == FGRAPH_TYPE_DATA) {
+ if (__get_data_index(val) == idx)
+ goto found;
+ offset -= __get_data_size(val) + 1;
+ val = get_fgraph_entry(current, offset);
+ }
+ return NULL;
+found:
+ if (size_bytes)
+ *size_bytes = __get_data_size(val) * sizeof(long);
+ return get_data_type_data(current, offset);
+}
+
/* Both enabled by default (can be cleared by function_graph tracer flags */
-static bool fgraph_sleep_time = true;
+bool fgraph_sleep_time = true;
#ifdef CONFIG_DYNAMIC_FTRACE
/*
@@ -51,6 +521,29 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
}
#endif
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return 0;
+}
+
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+}
+
+static struct fgraph_ops fgraph_stub = {
+ .entryfunc = ftrace_graph_entry_stub,
+ .retfunc = ftrace_graph_ret_stub,
+};
+
+static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub;
+DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub);
+DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub);
+static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct);
+
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
*
@@ -67,10 +560,12 @@ void ftrace_graph_stop(void)
/* Add a function return address to the trace stack on thread info.*/
static int
ftrace_push_return_trace(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+ unsigned long frame_pointer, unsigned long *retp,
+ int fgraph_idx)
{
- unsigned long long calltime;
- int index;
+ struct ftrace_ret_stack *ret_stack;
+ unsigned long val;
+ int offset;
if (unlikely(ftrace_graph_is_dead()))
return -EBUSY;
@@ -78,32 +573,64 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
if (!current->ret_stack)
return -EBUSY;
+ BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
+
+ /* Set val to "reserved" with the delta to the new fgraph frame */
+ val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+
/*
* We must make sure the ret_stack is tested before we read
* anything else.
*/
smp_rmb();
- /* The return trace stack is full */
- if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ /*
+ * Check if there's room on the shadow stack to fit a fraph frame
+ * and a bitmap word.
+ */
+ if (current->curr_ret_stack + FGRAPH_FRAME_OFFSET + 1 >= SHADOW_STACK_MAX_OFFSET) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
- calltime = trace_clock_local();
+ offset = READ_ONCE(current->curr_ret_stack);
+ ret_stack = RET_STACK(current, offset);
+ offset += FGRAPH_FRAME_OFFSET;
- index = ++current->curr_ret_stack;
+ /* ret offset = FGRAPH_FRAME_OFFSET ; type = reserved */
+ current->ret_stack[offset] = val;
+ ret_stack->ret = ret;
+ /*
+ * The unwinders expect curr_ret_stack to point to either zero
+ * or an offset where to find the next ret_stack. Even though the
+ * ret stack might be bogus, we want to write the ret and the
+ * offset to find the ret_stack before we increment the stack point.
+ * If an interrupt comes in now before we increment the curr_ret_stack
+ * it may blow away what we wrote. But that's fine, because the
+ * offset will still be correct (even though the 'ret' won't be).
+ * What we worry about is the offset being correct after we increment
+ * the curr_ret_stack and before we update that offset, as if an
+ * interrupt comes in and does an unwind stack dump, it will need
+ * at least a correct offset!
+ */
+ barrier();
+ WRITE_ONCE(current->curr_ret_stack, offset + 1);
+ /*
+ * This next barrier is to ensure that an interrupt coming in
+ * will not corrupt what we are about to write.
+ */
barrier();
- current->ret_stack[index].ret = ret;
- current->ret_stack[index].func = func;
- current->ret_stack[index].calltime = calltime;
+
+ /* Still keep it reserved even if an interrupt came in */
+ current->ret_stack[offset] = val;
+
+ ret_stack->ret = ret;
+ ret_stack->func = func;
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
- current->ret_stack[index].fp = frame_pointer;
+ ret_stack->fp = frame_pointer;
#endif
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
- current->ret_stack[index].retp = retp;
-#endif
- return 0;
+ ret_stack->retp = retp;
+ return offset;
}
/*
@@ -120,55 +647,92 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
# define MCOUNT_INSN_SIZE 0
#endif
-int function_graph_enter(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+/* If the caller does not use ftrace, call this function. */
+int function_graph_enter_regs(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp,
+ struct ftrace_regs *fregs)
{
struct ftrace_graph_ent trace;
+ unsigned long bitmap = 0;
+ int offset;
+ int bit;
+ int i;
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
- /*
- * Skip graph tracing if the return location is served by direct trampoline,
- * since call sequence and return addresses are unpredictable anyway.
- * Ex: BPF trampoline may call original function and may skip frame
- * depending on type of BPF programs attached.
- */
- if (ftrace_direct_func_count &&
- ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
+ bit = ftrace_test_recursion_trylock(func, ret);
+ if (bit < 0)
return -EBUSY;
-#endif
+
trace.func = func;
trace.depth = ++current->curr_ret_depth;
- if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+ offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
+ if (offset < 0)
goto out;
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace))
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ int save_curr_ret_stack = current->curr_ret_stack;
+
+ if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs))
+ bitmap |= BIT(fgraph_direct_gops->idx);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ } else
+#endif
+ {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
+ int save_curr_ret_stack;
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ save_curr_ret_stack = current->curr_ret_stack;
+ if (ftrace_ops_test(&gops->ops, func, NULL) &&
+ gops->entryfunc(&trace, gops, fregs))
+ bitmap |= BIT(i);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ }
+ }
+
+ if (!bitmap)
goto out_ret;
+ /*
+ * Since this function uses fgraph_idx = 0 as a tail-call checking
+ * flag, set that bit always.
+ */
+ set_bitmap(current, offset, bitmap | BIT(0));
+ ftrace_test_recursion_unlock(bit);
return 0;
out_ret:
- current->curr_ret_stack--;
+ current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1;
out:
current->curr_ret_depth--;
+ ftrace_test_recursion_unlock(bit);
return -EBUSY;
}
/* Retrieve a function return address to the trace stack on thread info.*/
-static void
+static struct ftrace_ret_stack *
ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
- unsigned long frame_pointer)
+ unsigned long frame_pointer, int *offset)
{
- int index;
+ struct ftrace_ret_stack *ret_stack;
- index = current->curr_ret_stack;
+ ret_stack = get_ret_stack(current, current->curr_ret_stack, offset);
- if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ if (unlikely(!ret_stack)) {
ftrace_graph_stop();
- WARN_ON(1);
+ WARN(1, "Bad function graph ret_stack pointer: %d",
+ current->curr_ret_stack);
/* Might as well panic, otherwise we have no where to go */
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
@@ -186,30 +750,32 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
* Note, -mfentry does not use frame pointers, and this test
* is not needed if CC_USING_FENTRY is set.
*/
- if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ if (unlikely(ret_stack->fp != frame_pointer)) {
ftrace_graph_stop();
WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
" from func %ps return to %lx\n",
- current->ret_stack[index].fp,
+ ret_stack->fp,
frame_pointer,
- (void *)current->ret_stack[index].func,
- current->ret_stack[index].ret);
+ (void *)ret_stack->func,
+ ret_stack->ret);
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#endif
- *ret = current->ret_stack[index].ret;
- trace->func = current->ret_stack[index].func;
- trace->calltime = current->ret_stack[index].calltime;
+ *offset += FGRAPH_FRAME_OFFSET;
+ *ret = ret_stack->ret;
+ trace->func = ret_stack->func;
trace->overrun = atomic_read(&current->trace_overrun);
- trace->depth = current->curr_ret_depth--;
+ trace->depth = current->curr_ret_depth;
/*
* We still want to trace interrupts coming in if
* max_depth is set to 1. Make sure the decrement is
* seen before ftrace_graph_return.
*/
barrier();
+
+ return ret_stack;
}
/*
@@ -237,52 +803,76 @@ static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
-/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */
-struct fgraph_ret_regs;
-
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
- unsigned long frame_pointer)
+static inline unsigned long
+__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer)
{
+ struct ftrace_ret_stack *ret_stack;
struct ftrace_graph_ret trace;
+ unsigned long bitmap;
unsigned long ret;
+ int offset;
+ int i;
+
+ ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
+
+ if (unlikely(!ret_stack)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ return (unsigned long)panic;
+ }
+
+ if (fregs)
+ ftrace_regs_set_instruction_pointer(fregs, ret);
- ftrace_pop_return_trace(&trace, &ret, frame_pointer);
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
- trace.retval = fgraph_ret_regs_return_value(ret_regs);
+ trace.retval = ftrace_regs_get_return_value(fregs);
#endif
- trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
+
+ bitmap = get_bitmap_bits(current, offset);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ if (test_bit(fgraph_direct_gops->idx, &bitmap))
+ static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs);
+ } else
+#endif
+ {
+ for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ gops->retfunc(&trace, gops, fregs);
+ }
+ }
+
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
* curr_ret_stack is after that.
*/
barrier();
- current->curr_ret_stack--;
-
- if (unlikely(!ret)) {
- ftrace_graph_stop();
- WARN_ON(1);
- /* Might as well panic. What else to do? */
- ret = (unsigned long)panic;
- }
+ current->curr_ret_stack = offset - FGRAPH_FRAME_OFFSET;
+ current->curr_ret_depth--;
return ret;
}
/*
- * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can
- * leave only ftrace_return_to_handler(ret_regs).
+ * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can
+ * leave only ftrace_return_to_handler(fregs).
*/
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
-unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs)
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
+unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs)
{
- return __ftrace_return_to_handler(ret_regs,
- fgraph_ret_regs_frame_pointer(ret_regs));
+ return __ftrace_return_to_handler(fregs,
+ ftrace_regs_get_frame_pointer(fregs));
}
#else
unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
@@ -293,7 +883,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
/**
* ftrace_graph_get_ret_stack - return the entry of the shadow stack
- * @task: The task to read the shadow stack from
+ * @task: The task to read the shadow stack from.
* @idx: Index down the shadow stack
*
* Return the ret_struct on the shadow stack of the @task at the
@@ -305,115 +895,151 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
struct ftrace_ret_stack *
ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
{
- idx = task->curr_ret_stack - idx;
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
- if (idx >= 0 && idx <= task->curr_ret_stack)
- return &task->ret_stack[idx];
+ if (offset < 0)
+ return NULL;
- return NULL;
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && --idx >= 0);
+
+ return ret_stack;
}
/**
- * ftrace_graph_ret_addr - convert a potentially modified stack return address
- * to its original value
+ * ftrace_graph_top_ret_addr - return the top return address in the shadow stack
+ * @task: The task to read the shadow stack from.
+ *
+ * Return the first return address on the shadow stack of the @task, which is
+ * not the fgraph's return_to_handler.
+ */
+unsigned long ftrace_graph_top_ret_addr(struct task_struct *task)
+{
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
+
+ if (offset < 0)
+ return 0;
+
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && ret_stack->ret == return_handler);
+
+ return ret_stack ? ret_stack->ret : 0;
+}
+
+/**
+ * ftrace_graph_ret_addr - return the original value of the return address
+ * @task: The task the unwinder is being executed on
+ * @idx: An initialized pointer to the next stack index to use
+ * @ret: The current return address (likely pointing to return_handler)
+ * @retp: The address on the stack of the current return location
*
* This function can be called by stack unwinding code to convert a found stack
- * return address ('ret') to its original value, in case the function graph
+ * return address (@ret) to its original value, in case the function graph
* tracer has modified it to be 'return_to_handler'. If the address hasn't
- * been modified, the unchanged value of 'ret' is returned.
+ * been modified, the unchanged value of @ret is returned.
*
- * 'idx' is a state variable which should be initialized by the caller to zero
- * before the first call.
+ * @idx holds the last index used to know where to start from. It should be
+ * initialized to zero for the first iteration as that will mean to start
+ * at the top of the shadow stack. If the location is found, this pointer
+ * will be assigned that location so that if called again, it will continue
+ * where it left off.
*
- * 'retp' is a pointer to the return address on the stack. It's ignored if
- * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ * @retp is a pointer to the return address on the stack.
*/
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp)
{
- int index = task->curr_ret_stack;
+ struct ftrace_ret_stack *ret_stack;
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
int i;
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+ if (ret != return_handler)
return ret;
- if (index < 0)
+ if (!idx)
return ret;
- for (i = 0; i <= index; i++)
- if (task->ret_stack[i].retp == retp)
- return task->ret_stack[i].ret;
+ i = *idx ? : task->curr_ret_stack;
+ while (i > 0) {
+ ret_stack = get_ret_stack(task, i, &i);
+ if (!ret_stack)
+ break;
+ /*
+ * For the tail-call, there would be 2 or more ftrace_ret_stacks on
+ * the ret_stack, which records "return_to_handler" as the return
+ * address except for the last one.
+ * But on the real stack, there should be 1 entry because tail-call
+ * reuses the return address on the stack and jump to the next function.
+ * Thus we will continue to find real return address.
+ */
+ if (ret_stack->retp == retp &&
+ ret_stack->ret != return_handler) {
+ *idx = i;
+ return ret_stack->ret;
+ }
+ }
return ret;
}
-#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
- unsigned long ret, unsigned long *retp)
-{
- int task_idx;
-
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
- return ret;
-
- task_idx = task->curr_ret_stack;
-
- if (!task->ret_stack || task_idx < *idx)
- return ret;
-
- task_idx -= *idx;
- (*idx)++;
-
- return task->ret_stack[task_idx].ret;
-}
-#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
.func = ftrace_graph_func,
- .flags = FTRACE_OPS_FL_INITIALIZED |
- FTRACE_OPS_FL_PID |
- FTRACE_OPS_GRAPH_STUB,
+ .flags = FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
#endif
- ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
-void ftrace_graph_sleep_time_control(bool enable)
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops)
{
- fgraph_sleep_time = enable;
+ dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (src_ops) {
+ dst_ops->func_hash = &src_ops->local_hash;
+ mutex_init(&dst_ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&dst_ops->subop_list);
+ dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
}
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+void ftrace_graph_sleep_time_control(bool enable)
{
- return 0;
+ fgraph_sleep_time = enable;
}
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
*/
-extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
/* The callbacks that hook a function */
trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
-static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
{
int i;
int ret = 0;
int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
struct task_struct *g, *t;
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return -ENOMEM;
+
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -431,9 +1057,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
if (t->ret_stack == NULL) {
atomic_set(&t->trace_overrun, 0);
- t->curr_ret_stack = -1;
+ ret_stack_init_task_vars(ret_stack_list[start]);
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
- /* Make sure the tasks see the -1 first: */
+ /* Make sure the tasks see the 0 first: */
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
@@ -443,7 +1070,7 @@ unlock:
rcu_read_unlock();
free:
for (i = start; i < end; i++)
- kfree(ret_stack_list[i]);
+ kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]);
return ret;
}
@@ -454,7 +1081,6 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
unsigned int prev_state)
{
unsigned long long timestamp;
- int index;
/*
* Does the user want to count the time a function was asleep.
@@ -471,63 +1097,19 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
if (!next->ftrace_timestamp)
return;
- /*
- * Update all the counters in next to make up for the
- * time next was sleeping.
- */
- timestamp -= next->ftrace_timestamp;
-
- for (index = next->curr_ret_stack; index >= 0; index--)
- next->ret_stack[index].calltime += timestamp;
-}
-
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
- if (!ftrace_ops_test(&global_ops, trace->func, NULL))
- return 0;
- return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
- struct ftrace_ops *op;
- bool do_test = false;
-
- /*
- * The graph and global ops share the same set of functions
- * to test. If any other ops is on the list, then
- * the graph tracing needs to test if its the function
- * it should call.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (op != &global_ops && op != &graph_ops &&
- op != &ftrace_list_end) {
- do_test = true;
- /* in double loop, break out with goto */
- goto out;
- }
- } while_for_each_ftrace_op(op);
- out:
- if (do_test)
- ftrace_graph_entry = ftrace_graph_entry_test;
- else
- ftrace_graph_entry = __ftrace_graph_entry;
+ next->ftrace_sleeptime += timestamp - next->ftrace_timestamp;
}
-static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
static void
-graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+graph_init_task(struct task_struct *t, unsigned long *ret_stack)
{
atomic_set(&t->trace_overrun, 0);
+ ret_stack_init_task_vars(ret_stack);
t->ftrace_timestamp = 0;
+ t->curr_ret_stack = 0;
+ t->curr_ret_depth = -1;
/* make curr_ret_stack visible before we add the ret_stack */
smp_wmb();
t->ret_stack = ret_stack;
@@ -539,7 +1121,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
*/
void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
{
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
/*
* The idle task has no parent, it either has its own
@@ -549,14 +1131,14 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
+
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -570,15 +1152,16 @@ void ftrace_graph_init_task(struct task_struct *t)
{
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
- ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
@@ -587,24 +1170,67 @@ void ftrace_graph_init_task(struct task_struct *t)
void ftrace_graph_exit_task(struct task_struct *t)
{
- struct ftrace_ret_stack *ret_stack = t->ret_stack;
+ unsigned long *ret_stack = t->ret_stack;
t->ret_stack = NULL;
/* NULL must become visible to IRQs before we free it: */
barrier();
- kfree(ret_stack);
+ if (ret_stack) {
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+ kmem_cache_free(fgraph_stack_cachep, ret_stack);
+ }
}
+#ifdef CONFIG_DYNAMIC_FTRACE
+static int fgraph_pid_func(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ struct trace_array *tr = gops->ops.private;
+ int pid;
+
+ if (tr) {
+ pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ if (pid == FTRACE_PID_IGNORE)
+ return 0;
+ if (pid != FTRACE_PID_TRACE &&
+ pid != current->pid)
+ return 0;
+ }
+
+ return gops->saved_func(trace, gops, fregs);
+}
+
+void fgraph_update_pid_func(void)
+{
+ struct fgraph_ops *gops;
+ struct ftrace_ops *op;
+
+ if (!(graph_ops.flags & FTRACE_OPS_FL_INITIALIZED))
+ return;
+
+ list_for_each_entry(op, &graph_ops.subop_list, list) {
+ if (op->flags & FTRACE_OPS_FL_PID) {
+ gops = container_of(op, struct fgraph_ops, ops);
+ gops->entryfunc = ftrace_pids_enabled(op) ?
+ fgraph_pid_func : gops->saved_func;
+ if (ftrace_graph_active == 1)
+ static_call_update(fgraph_func, gops->entryfunc);
+ }
+ }
+}
+#endif
+
/* Allocate a return stack for each task */
static int start_graph_tracing(void)
{
- struct ftrace_ret_stack **ret_stack_list;
+ unsigned long **ret_stack_list;
int ret, cpu;
- ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
- sizeof(struct ftrace_ret_stack *),
- GFP_KERNEL);
+ ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(*ret_stack_list), GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
@@ -630,60 +1256,180 @@ static int start_graph_tracing(void)
return ret;
}
+static void init_task_vars(int idx)
+{
+ struct task_struct *g, *t;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (idle_task(cpu)->ret_stack)
+ ret_stack_set_task_var(idle_task(cpu), idx, 0);
+ }
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, t) {
+ if (t->ret_stack)
+ ret_stack_set_task_var(t, idx, 0);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops)
+{
+ trace_func_graph_ent_t func = NULL;
+ trace_func_graph_ret_t retfunc = NULL;
+ int i;
+
+ if (gops) {
+ func = gops->entryfunc;
+ retfunc = gops->retfunc;
+ fgraph_direct_gops = gops;
+ } else {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ func = fgraph_array[i]->entryfunc;
+ retfunc = fgraph_array[i]->retfunc;
+ fgraph_direct_gops = fgraph_array[i];
+ }
+ }
+ if (WARN_ON_ONCE(!func))
+ return;
+
+ static_call_update(fgraph_func, func);
+ static_call_update(fgraph_retfunc, retfunc);
+ if (enable_branch)
+ static_branch_disable(&fgraph_do_direct);
+}
+
+static void ftrace_graph_disable_direct(bool disable_branch)
+{
+ if (disable_branch)
+ static_branch_disable(&fgraph_do_direct);
+ static_call_update(fgraph_func, ftrace_graph_entry_stub);
+ static_call_update(fgraph_retfunc, ftrace_graph_ret_stub);
+ fgraph_direct_gops = &fgraph_stub;
+}
+
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
+ int command = 0;
int ret = 0;
+ int i = -1;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
- /* we currently allow only one tracer registered at a time */
- if (ftrace_graph_active) {
- ret = -EBUSY;
- goto out;
+ if (!fgraph_stack_cachep) {
+ fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
+ SHADOW_STACK_SIZE,
+ SHADOW_STACK_SIZE, 0, NULL);
+ if (!fgraph_stack_cachep)
+ return -ENOMEM;
}
- register_pm_notifier(&ftrace_suspend_notifier);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
- ftrace_graph_active++;
- ret = start_graph_tracing();
- if (ret) {
- ftrace_graph_active--;
- goto out;
+ if (!fgraph_array[0]) {
+ /* The array must always have real data on it */
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_array[i] = &fgraph_stub;
+ fgraph_lru_init();
}
- ftrace_graph_return = gops->retfunc;
+ i = fgraph_lru_alloc_index();
+ if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub))
+ return -ENOSPC;
+ gops->idx = i;
- /*
- * Update the indirect function to the entryfunc, and the
- * function that gets called to the entry_test first. Then
- * call the update fgraph entry function to determine if
- * the entryfunc should be called directly or not.
- */
- __ftrace_graph_entry = gops->entryfunc;
- ftrace_graph_entry = ftrace_graph_entry_test;
- update_function_graph_func();
+ ftrace_graph_active++;
+
+ if (ftrace_graph_active == 2)
+ ftrace_graph_disable_direct(true);
+
+ if (ftrace_graph_active == 1) {
+ ftrace_graph_enable_direct(false, gops);
+ register_pm_notifier(&ftrace_suspend_notifier);
+ ret = start_graph_tracing();
+ if (ret)
+ goto error;
+ /*
+ * Some archs just test to see if these are not
+ * the default function
+ */
+ ftrace_graph_return = return_run;
+ ftrace_graph_entry = entry_run;
+ command = FTRACE_START_FUNC_RET;
+ } else {
+ init_task_vars(gops->idx);
+ }
+ /* Always save the function, and reset at unregistering */
+ gops->saved_func = gops->entryfunc;
- ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-out:
- mutex_unlock(&ftrace_lock);
+ ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
+ if (!ret)
+ fgraph_array[i] = gops;
+
+error:
+ if (ret) {
+ ftrace_graph_active--;
+ gops->saved_func = NULL;
+ fgraph_lru_release_index(i);
+ }
return ret;
}
void unregister_ftrace_graph(struct fgraph_ops *gops)
{
- mutex_lock(&ftrace_lock);
+ int command = 0;
+
+ guard(mutex)(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
- goto out;
+ return;
+
+ if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
+ fgraph_array[gops->idx] != gops))
+ return;
+
+ if (fgraph_lru_release_index(gops->idx) < 0)
+ return;
+
+ fgraph_array[gops->idx] = &fgraph_stub;
ftrace_graph_active--;
- ftrace_graph_return = ftrace_stub_graph;
- ftrace_graph_entry = ftrace_graph_entry_stub;
- __ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
- unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
- out:
- mutex_unlock(&ftrace_lock);
+ if (!ftrace_graph_active)
+ command = FTRACE_STOP_FUNC_RET;
+
+ ftrace_shutdown_subops(&graph_ops, &gops->ops, command);
+
+ if (ftrace_graph_active == 1)
+ ftrace_graph_enable_direct(true, NULL);
+ else if (!ftrace_graph_active)
+ ftrace_graph_disable_direct(false);
+
+ if (!ftrace_graph_active) {
+ ftrace_graph_return = ftrace_stub_graph;
+ ftrace_graph_entry = ftrace_graph_entry_stub;
+ unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ }
+ gops->saved_func = NULL;
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9ff018245840..33082c4e8154 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -8,98 +8,224 @@
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
-#include <linux/rethook.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <asm/fprobe.h>
+
#include "trace.h"
-struct fprobe_rethook_node {
- struct rethook_node node;
- unsigned long entry_ip;
- unsigned long entry_parent_ip;
- char data[];
-};
+#define FPROBE_IP_HASH_BITS 8
+#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
-static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
- struct fprobe_rethook_node *fpr;
- struct rethook_node *rh = NULL;
- struct fprobe *fp;
- void *entry_data = NULL;
- int ret = 0;
+#define FPROBE_HASH_BITS 6
+#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
- fp = container_of(ops, struct fprobe, ops);
+#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2))
- if (fp->exit_handler) {
- rh = rethook_try_get(fp->rethook);
- if (!rh) {
- fp->nmissed++;
- return;
- }
- fpr = container_of(rh, struct fprobe_rethook_node, node);
- fpr->entry_ip = ip;
- fpr->entry_parent_ip = parent_ip;
- if (fp->entry_data_size)
- entry_data = fpr->data;
+/*
+ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
+ * exists. The key is the address of fprobe instance.
+ * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
+ * instance related to the funciton address. The key is the ftrace IP
+ * address.
+ *
+ * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
+ * are set NULL and delete those from both hash tables (by hlist_del_rcu).
+ * After an RCU grace period, the fprobe_hlist itself will be released.
+ *
+ * fprobe_table and fprobe_ip_table can be accessed from either
+ * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held.
+ * - RCU hlist traversal under disabling preempt
+ */
+static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
+static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static DEFINE_MUTEX(fprobe_mutex);
+
+/*
+ * Find first fprobe in the hlist. It will be iterated twice in the entry
+ * probe, once for correcting the total required size, the second time is
+ * calling back the user handlers.
+ * Thus the hlist in the fprobe_table must be sorted and new probe needs to
+ * be added *before* the first fprobe.
+ */
+static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+{
+ struct fprobe_hlist_node *node;
+ struct hlist_head *head;
+
+ head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+ hlist_for_each_entry_rcu(node, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (node->addr == ip)
+ return node;
}
+ return NULL;
+}
+NOKPROBE_SYMBOL(find_first_fprobe_node);
+
+/* Node insertion and deletion requires the fprobe_mutex */
+static void insert_fprobe_node(struct fprobe_hlist_node *node)
+{
+ unsigned long ip = node->addr;
+ struct fprobe_hlist_node *next;
+ struct hlist_head *head;
- if (fp->entry_handler)
- ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
+ lockdep_assert_held(&fprobe_mutex);
- /* If entry_handler returns !0, nmissed is not counted. */
- if (rh) {
- if (ret)
- rethook_recycle(rh);
- else
- rethook_hook(rh, ftrace_get_regs(fregs), true);
+ next = find_first_fprobe_node(ip);
+ if (next) {
+ hlist_add_before_rcu(&node->hlist, &next->hlist);
+ return;
}
+ head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+ hlist_add_head_rcu(&node->hlist, head);
}
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Return true if there are synonims */
+static bool delete_fprobe_node(struct fprobe_hlist_node *node)
{
- struct fprobe *fp;
- int bit;
+ lockdep_assert_held(&fprobe_mutex);
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+ WRITE_ONCE(node->fp, NULL);
+ hlist_del_rcu(&node->hlist);
+ return !!find_first_fprobe_node(node->addr);
+}
- /* recursion detection has to go before any traceable function and
- * all functions before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
+/* Check existence of the fprobe */
+static bool is_fprobe_still_exist(struct fprobe *fp)
+{
+ struct hlist_head *head;
+ struct fprobe_hlist *fph;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_for_each_entry_rcu(fph, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (fph->fp == fp)
+ return true;
}
- __fprobe_handler(ip, parent_ip, ops, fregs);
- ftrace_test_recursion_unlock(bit);
+ return false;
+}
+NOKPROBE_SYMBOL(is_fprobe_still_exist);
+
+static int add_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+ struct hlist_head *head;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ if (is_fprobe_still_exist(fp))
+ return -EEXIST;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_add_head_rcu(&fp->hlist_array->hlist, head);
+ return 0;
+}
+
+static int del_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ if (!is_fprobe_still_exist(fp))
+ return -ENOENT;
+
+ fph->fp = NULL;
+ hlist_del_rcu(&fph->hlist);
+ return 0;
+}
+
+#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER
+
+/* The arch should encode fprobe_header info into one unsigned long */
+#define FPROBE_HEADER_SIZE_IN_LONG 1
+
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
+ !arch_fprobe_header_encodable(fp)))
+ return false;
+ *stack = arch_encode_fprobe_header(fp, size_words);
+ return true;
}
-NOKPROBE_SYMBOL(fprobe_handler);
-static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
{
+ *fp = arch_decode_fprobe_header_fp(*stack);
+ *size_words = arch_decode_fprobe_header_size(*stack);
+}
+
+#else
+
+/* Generic fprobe_header */
+struct __fprobe_header {
struct fprobe *fp;
- int bit;
+ unsigned long size_words;
+} __packed;
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header))
- /* recursion detection has to go before any traceable function and
- * all functions called before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD))
+ return false;
+
+ fph->fp = fp;
+ fph->size_words = size_words;
+ return true;
+}
+
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ *fp = fph->fp;
+ *size_words = fph->size_words;
+}
+
+#endif
+
+/*
+ * fprobe shadow stack management:
+ * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
+ * among the probes on the same function exit. Note that a new probe can be
+ * registered before a target function is returning, we can not use the hash
+ * table to find the corresponding probes. Thus the probe address is stored on
+ * the shadow stack with its entry data size.
+ *
+ */
+static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ if (!fp->entry_handler)
+ return 0;
+ return fp->entry_handler(fp, ip, parent_ip, fregs, data);
+}
+
+static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ int ret;
/*
* This user handler is shared with other kprobes and is not expected to be
* called recursively. So if any other kprobe handler is running, this will
@@ -108,44 +234,182 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
*/
if (unlikely(kprobe_running())) {
fp->nmissed++;
- goto recursion_unlock;
+ return 0;
}
kprobe_busy_begin();
- __fprobe_handler(ip, parent_ip, ops, fregs);
+ ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
kprobe_busy_end();
-
-recursion_unlock:
- ftrace_test_recursion_unlock(bit);
+ return ret;
}
-static void fprobe_exit_handler(struct rethook_node *rh, void *data,
- unsigned long ret_ip, struct pt_regs *regs)
+static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct fprobe *fp = (struct fprobe *)data;
- struct fprobe_rethook_node *fpr;
- int bit;
+ struct fprobe_hlist_node *node, *first;
+ unsigned long *fgraph_data = NULL;
+ unsigned long func = trace->func;
+ unsigned long ret_ip;
+ int reserved_words;
+ struct fprobe *fp;
+ int used, ret;
- if (!fp || fprobe_disabled(fp))
- return;
+ if (WARN_ON_ONCE(!fregs))
+ return 0;
+
+ first = node = find_first_fprobe_node(func);
+ if (unlikely(!first))
+ return 0;
- fpr = container_of(rh, struct fprobe_rethook_node, node);
+ reserved_words = 0;
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (!fp || !fp->exit_handler)
+ continue;
+ /*
+ * Since fprobe can be enabled until the next loop, we ignore the
+ * fprobe's disabled flag in this loop.
+ */
+ reserved_words +=
+ FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
+ }
+ node = first;
+ if (reserved_words) {
+ fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
+ if (unlikely(!fgraph_data)) {
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (fp && !fprobe_disabled(fp))
+ fp->nmissed++;
+ }
+ return 0;
+ }
+ }
/*
- * we need to assure no calls to traceable functions in-between the
- * end of fprobe_handler and the beginning of fprobe_exit_handler.
+ * TODO: recursion detection has been done in the fgraph. Thus we need
+ * to add a callback to increment missed counter.
*/
- bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
- if (bit < 0) {
- fp->nmissed++;
+ ret_ip = ftrace_regs_get_return_address(fregs);
+ used = 0;
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ int data_size;
+ void *data;
+
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (!fp || fprobe_disabled(fp))
+ continue;
+
+ data_size = fp->entry_data_size;
+ if (data_size && fp->exit_handler)
+ data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
+ else
+ data = NULL;
+
+ if (fprobe_shared_with_kprobes(fp))
+ ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
+ else
+ ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
+
+ /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
+ if (!ret && fp->exit_handler) {
+ int size_words = SIZE_IN_LONG(data_size);
+
+ if (write_fprobe_header(&fgraph_data[used], fp, size_words))
+ used += FPROBE_HEADER_SIZE_IN_LONG + size_words;
+ }
+ }
+ if (used < reserved_words)
+ memset(fgraph_data + used, 0, reserved_words - used);
+
+ /* If any exit_handler is set, data must be used. */
+ return used != 0;
+}
+NOKPROBE_SYMBOL(fprobe_entry);
+
+static void fprobe_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ unsigned long *fgraph_data = NULL;
+ unsigned long ret_ip;
+ struct fprobe *fp;
+ int size, curr;
+ int size_words;
+
+ fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
+ if (WARN_ON_ONCE(!fgraph_data))
return;
+ size_words = SIZE_IN_LONG(size);
+ ret_ip = ftrace_regs_get_instruction_pointer(fregs);
+
+ preempt_disable();
+
+ curr = 0;
+ while (size_words > curr) {
+ read_fprobe_header(&fgraph_data[curr], &fp, &size);
+ if (!fp)
+ break;
+ curr += FPROBE_HEADER_SIZE_IN_LONG;
+ if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+ if (WARN_ON_ONCE(curr + size > size_words))
+ break;
+ fp->exit_handler(fp, trace->func, ret_ip, fregs,
+ size ? fgraph_data + curr : NULL);
+ }
+ curr += size;
+ }
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(fprobe_return);
+
+static struct fgraph_ops fprobe_graph_ops = {
+ .entryfunc = fprobe_entry,
+ .retfunc = fprobe_return,
+};
+static int fprobe_graph_active;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_graph_active) {
+ ret = register_ftrace_graph(&fprobe_graph_ops);
+ if (WARN_ON_ONCE(ret)) {
+ ftrace_free_filter(&fprobe_graph_ops.ops);
+ return ret;
+ }
}
+ fprobe_graph_active++;
+ return 0;
+}
- fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
- fp->entry_data_size ? (void *)fpr->data : NULL);
- ftrace_test_recursion_unlock(bit);
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ fprobe_graph_active--;
+ /* Q: should we unregister it ? */
+ if (!fprobe_graph_active)
+ unregister_ftrace_graph(&fprobe_graph_ops);
+
+ if (num)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
}
-NOKPROBE_SYMBOL(fprobe_exit_handler);
static int symbols_cmp(const void *a, const void *b)
{
@@ -175,53 +439,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num)
return ERR_PTR(-ENOENT);
}
-static void fprobe_init(struct fprobe *fp)
-{
- fp->nmissed = 0;
- if (fprobe_shared_with_kprobes(fp))
- fp->ops.func = fprobe_kprobe_handler;
- else
- fp->ops.func = fprobe_handler;
- fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
-}
+struct filter_match_data {
+ const char *filter;
+ const char *notfilter;
+ size_t index;
+ size_t size;
+ unsigned long *addrs;
+};
-static int fprobe_init_rethook(struct fprobe *fp, int num)
+static int filter_match_callback(void *data, const char *name, unsigned long addr)
{
- int size;
+ struct filter_match_data *match = data;
- if (!fp->exit_handler) {
- fp->rethook = NULL;
+ if (!glob_match(match->filter, name) ||
+ (match->notfilter && glob_match(match->notfilter, name)))
return 0;
- }
- /* Initialize rethook if needed */
- if (fp->nr_maxactive)
- num = fp->nr_maxactive;
- else
- num *= num_possible_cpus() * 2;
- if (num <= 0)
- return -EINVAL;
+ if (!ftrace_location(addr))
+ return 0;
+
+ if (match->addrs)
+ match->addrs[match->index] = addr;
- size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+ match->index++;
+ return match->index == match->size;
+}
- /* Initialize rethook */
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
- if (IS_ERR(fp->rethook))
- return PTR_ERR(fp->rethook);
+/*
+ * Make IP list from the filter/no-filter glob patterns.
+ * Return the number of matched symbols, or -ENOENT.
+ */
+static int ip_list_from_filter(const char *filter, const char *notfilter,
+ unsigned long *addrs, size_t size)
+{
+ struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
+ .index = 0, .size = size, .addrs = addrs};
+ int ret;
- return 0;
+ ret = kallsyms_on_each_symbol(filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+ ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+
+ return match.index ?: -ENOENT;
}
static void fprobe_fail_cleanup(struct fprobe *fp)
{
- if (!IS_ERR_OR_NULL(fp->rethook)) {
- /* Don't need to cleanup rethook->handler because this is not used. */
- rethook_free(fp->rethook);
- fp->rethook = NULL;
+ kfree(fp->hlist_array);
+ fp->hlist_array = NULL;
+}
+
+/* Initialize the fprobe data structure. */
+static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ struct fprobe_hlist *hlist_array;
+ unsigned long addr;
+ int size, i;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ size = ALIGN(fp->entry_data_size, sizeof(long));
+ if (size > MAX_FPROBE_DATA_SIZE)
+ return -E2BIG;
+ fp->entry_data_size = size;
+
+ hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
+ if (!hlist_array)
+ return -ENOMEM;
+
+ fp->nmissed = 0;
+
+ hlist_array->size = num;
+ fp->hlist_array = hlist_array;
+ hlist_array->fp = fp;
+ for (i = 0; i < num; i++) {
+ hlist_array->array[i].fp = fp;
+ addr = ftrace_location(addrs[i]);
+ if (!addr) {
+ fprobe_fail_cleanup(fp);
+ return -ENOENT;
+ }
+ hlist_array->array[i].addr = addr;
}
- ftrace_free_filter(&fp->ops);
+ return 0;
}
+#define FPROBE_IPS_MAX INT_MAX
+
/**
* register_fprobe() - Register fprobe to ftrace by pattern.
* @fp: A fprobe data structure to be registered.
@@ -235,46 +543,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp)
*/
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
{
- struct ftrace_hash *hash;
- unsigned char *str;
- int ret, len;
+ unsigned long *addrs;
+ int ret;
if (!fp || !filter)
return -EINVAL;
- fprobe_init(fp);
-
- len = strlen(filter);
- str = kstrdup(filter, GFP_KERNEL);
- ret = ftrace_set_filter(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
+ ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
+ if (ret < 0)
return ret;
- if (notfilter) {
- len = strlen(notfilter);
- str = kstrdup(notfilter, GFP_KERNEL);
- ret = ftrace_set_notrace(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
- goto out;
- }
-
- /* TODO:
- * correctly calculate the total number of filtered symbols
- * from both filter and notfilter.
- */
- hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
- if (WARN_ON_ONCE(!hash))
- goto out;
-
- ret = fprobe_init_rethook(fp, (int)hash->count);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ ret = ip_list_from_filter(filter, notfilter, addrs, ret);
+ if (ret > 0)
+ ret = register_fprobe_ips(fp, addrs, ret);
-out:
- if (ret)
- fprobe_fail_cleanup(fp);
+ kfree(addrs);
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe);
@@ -282,7 +568,7 @@ EXPORT_SYMBOL_GPL(register_fprobe);
/**
* register_fprobe_ips() - Register fprobe to ftrace by address.
* @fp: A fprobe data structure to be registered.
- * @addrs: An array of target ftrace location addresses.
+ * @addrs: An array of target function address.
* @num: The number of entries of @addrs.
*
* Register @fp to ftrace for enabling the probe on the address given by @addrs.
@@ -294,23 +580,27 @@ EXPORT_SYMBOL_GPL(register_fprobe);
*/
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
{
- int ret;
-
- if (!fp || !addrs || num <= 0)
- return -EINVAL;
-
- fprobe_init(fp);
+ struct fprobe_hlist *hlist_array;
+ int ret, i;
- ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ ret = fprobe_init(fp, addrs, num);
if (ret)
return ret;
- ret = fprobe_init_rethook(fp, num);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ mutex_lock(&fprobe_mutex);
+
+ hlist_array = fp->hlist_array;
+ ret = fprobe_graph_add_ips(addrs, num);
+ if (!ret) {
+ add_fprobe_hash(fp);
+ for (i = 0; i < hlist_array->size; i++)
+ insert_fprobe_node(&hlist_array->array[i]);
+ }
+ mutex_unlock(&fprobe_mutex);
if (ret)
fprobe_fail_cleanup(fp);
+
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -348,14 +638,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms);
bool fprobe_is_registered(struct fprobe *fp)
{
- if (!fp || (fp->ops.saved_func != fprobe_handler &&
- fp->ops.saved_func != fprobe_kprobe_handler))
+ if (!fp || !fp->hlist_array)
return false;
return true;
}
/**
- * unregister_fprobe() - Unregister fprobe from ftrace
+ * unregister_fprobe() - Unregister fprobe.
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
@@ -364,23 +653,40 @@ bool fprobe_is_registered(struct fprobe *fp)
*/
int unregister_fprobe(struct fprobe *fp)
{
- int ret;
+ struct fprobe_hlist *hlist_array;
+ unsigned long *addrs = NULL;
+ int ret = 0, i, count;
- if (!fprobe_is_registered(fp))
- return -EINVAL;
+ mutex_lock(&fprobe_mutex);
+ if (!fp || !is_fprobe_still_exist(fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ hlist_array = fp->hlist_array;
+ addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
+ if (!addrs) {
+ ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */
+ goto out;
+ }
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_stop(fp->rethook);
+ /* Remove non-synonim ips from table and hash */
+ count = 0;
+ for (i = 0; i < hlist_array->size; i++) {
+ if (!delete_fprobe_node(&hlist_array->array[i]))
+ addrs[count++] = hlist_array->array[i].addr;
+ }
+ del_fprobe_hash(fp);
- ret = unregister_ftrace_function(&fp->ops);
- if (ret < 0)
- return ret;
+ fprobe_graph_remove_ips(addrs, count);
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_free(fp->rethook);
+ kfree_rcu(hlist_array, rcu);
+ fp->hlist_array = NULL;
- ftrace_free_filter(&fp->ops);
+out:
+ mutex_unlock(&fprobe_mutex);
+ kfree(addrs);
return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83ba342aef31..fc88e0688daf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -74,7 +74,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), \
+ .subop_list = LIST_HEAD_INIT(opsname.subop_list),
#else
#define INIT_OPS_HASH(opsname)
#endif
@@ -99,7 +100,7 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-static bool ftrace_pids_enabled(struct ftrace_ops *ops)
+bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
struct trace_array *tr;
@@ -121,7 +122,7 @@ static int ftrace_disabled __read_mostly;
DEFINE_MUTEX(ftrace_lock);
-struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
+struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = (struct ftrace_ops __rcu *)&ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
@@ -161,12 +162,14 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
mutex_init(&ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&ops->subop_list);
ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
}
+/* Call this function for when a callback filters on set_ftrace_pid */
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
@@ -235,8 +238,6 @@ static void update_ftrace_function(void)
func = ftrace_ops_list_func;
}
- update_function_graph_func();
-
/* If there's no change, then do nothing more here */
if (ftrace_trace_function == func)
return;
@@ -310,7 +311,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
lockdep_is_held(&ftrace_lock)) == ops &&
rcu_dereference_protected(ops->next,
lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
- *list = &ftrace_list_end;
+ rcu_assign_pointer(*list, &ftrace_list_end);
return 0;
}
@@ -406,6 +407,8 @@ static void ftrace_update_pid_func(void)
}
} while_for_each_ftrace_op(op);
+ fgraph_update_pid_func();
+
update_ftrace_function();
}
@@ -533,24 +536,22 @@ static int function_stat_show(struct seq_file *m, void *v)
{
struct ftrace_profile *rec = v;
char str[KSYM_SYMBOL_LEN];
- int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static struct trace_seq s;
unsigned long long avg;
unsigned long long stddev;
+ unsigned long long stddev_denom;
#endif
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
/* we raced with function_profile_reset() */
- if (unlikely(rec->counter == 0)) {
- ret = -EBUSY;
- goto out;
- }
+ if (unlikely(rec->counter == 0))
+ return -EBUSY;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
- goto out;
+ return 0;
#endif
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_puts(m, " ");
- /* Sample standard deviation (s^2) */
- if (rec->counter <= 1)
- stddev = 0;
- else {
- /*
- * Apply Welford's method:
- * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
- */
+ /*
+ * Variance formula:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ * Maybe Welford's method is better here?
+ * Divide only by 1000 for ns^2 -> us^2 conversion.
+ * trace_print_graph_duration will divide by 1000 again.
+ */
+ stddev = 0;
+ stddev_denom = rec->counter * (rec->counter - 1) * 1000;
+ if (stddev_denom) {
stddev = rec->counter * rec->time_squared -
rec->time * rec->time;
-
- /*
- * Divide only 1000 for ns^2 -> us^2 conversion.
- * trace_print_graph_duration will divide 1000 again.
- */
- stddev = div64_ul(stddev,
- rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev, stddev_denom);
}
trace_seq_init(&s);
@@ -587,10 +584,8 @@ static int function_stat_show(struct seq_file *m, void *v)
trace_print_seq(m, &s);
#endif
seq_putc(m, '\n');
-out:
- mutex_unlock(&ftrace_profile_lock);
- return ret;
+ return 0;
}
static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -786,27 +781,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
- unsigned long flags;
if (!ftrace_profile_enabled)
return;
- local_irq_save(flags);
+ guard(preempt_notrace)();
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
rec = ftrace_find_profiled_func(stat, ip);
if (!rec) {
rec = ftrace_profile_alloc(stat, ip);
if (!rec)
- goto out;
+ return;
}
rec->counter++;
- out:
- local_irq_restore(flags);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -817,9 +809,17 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
}
-static int profile_graph_entry(struct ftrace_graph_ent *trace)
+struct profile_fgraph_data {
+ unsigned long long calltime;
+ unsigned long long subtime;
+ unsigned long long sleeptime;
+};
+
+static int profile_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct ftrace_ret_stack *ret_stack;
+ struct profile_fgraph_data *profile_data;
function_profile_call(trace->func, 0, NULL, NULL);
@@ -827,42 +827,57 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
if (!current->ret_stack)
return 0;
- ret_stack = ftrace_graph_get_ret_stack(current, 0);
- if (ret_stack)
- ret_stack->subtime = 0;
+ profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data));
+ if (!profile_data)
+ return 0;
+
+ profile_data->subtime = 0;
+ profile_data->sleeptime = current->ftrace_sleeptime;
+ profile_data->calltime = trace_clock_local();
return 1;
}
-static void profile_graph_return(struct ftrace_graph_ret *trace)
+static void profile_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct ftrace_ret_stack *ret_stack;
+ struct profile_fgraph_data *profile_data;
struct ftrace_profile_stat *stat;
unsigned long long calltime;
+ unsigned long long rettime = trace_clock_local();
struct ftrace_profile *rec;
- unsigned long flags;
+ int size;
+
+ guard(preempt_notrace)();
- local_irq_save(flags);
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
+
+ profile_data = fgraph_retrieve_data(gops->idx, &size);
/* If the calltime was zero'd ignore it */
- if (!trace->calltime)
- goto out;
+ if (!profile_data || !profile_data->calltime)
+ return;
+
+ calltime = rettime - profile_data->calltime;
- calltime = trace->rettime - trace->calltime;
+ if (!fgraph_sleep_time) {
+ if (current->ftrace_sleeptime)
+ calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
+ }
if (!fgraph_graph_time) {
+ struct profile_fgraph_data *parent_data;
/* Append this call time to the parent time to subtract */
- ret_stack = ftrace_graph_get_ret_stack(current, 1);
- if (ret_stack)
- ret_stack->subtime += calltime;
+ parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1);
+ if (parent_data)
+ parent_data->subtime += calltime;
- ret_stack = ftrace_graph_get_ret_stack(current, 0);
- if (ret_stack && ret_stack->subtime < calltime)
- calltime -= ret_stack->subtime;
+ if (profile_data->subtime && profile_data->subtime < calltime)
+ calltime -= profile_data->subtime;
else
calltime = 0;
}
@@ -872,9 +887,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
rec->time += calltime;
rec->time_squared += calltime * calltime;
}
-
- out:
- local_irq_restore(flags);
}
static struct fgraph_ops fprofiler_ops = {
@@ -884,6 +896,7 @@ static struct fgraph_ops fprofiler_ops = {
static int register_ftrace_profiler(void)
{
+ ftrace_ops_set_global_filter(&fprofiler_ops.ops);
return register_ftrace_graph(&fprofiler_ops);
}
@@ -894,12 +907,11 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
{
+ ftrace_ops_set_global_filter(&ftrace_profile_ops);
return register_ftrace_function(&ftrace_profile_ops);
}
@@ -922,20 +934,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
val = !!val;
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
if (ftrace_profile_enabled ^ val) {
if (val) {
ret = ftrace_profile_init();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ret = register_ftrace_profiler();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ftrace_profile_enabled = 1;
} else {
ftrace_profile_enabled = 0;
@@ -946,8 +954,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
unregister_ftrace_profiler();
}
}
- out:
- mutex_unlock(&ftrace_profile_lock);
*ppos += cnt;
@@ -1160,7 +1166,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
* Search a given @hash to see if a given instruction pointer (@ip)
* exists in it.
*
- * Returns the entry that holds the @ip if found. NULL otherwise.
+ * Returns: the entry that holds the @ip if found. NULL otherwise.
*/
struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1282,7 +1288,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
/**
* ftrace_free_filter - remove all filters for an ftrace_ops
- * @ops - the ops to remove the filters from
+ * @ops: the ops to remove the filters from
*/
void ftrace_free_filter(struct ftrace_ops *ops)
{
@@ -1314,7 +1320,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
-
+/* Used to save filters on functions for modules not loaded yet */
static int ftrace_add_mod(struct trace_array *tr,
const char *func, const char *module,
int enable)
@@ -1380,15 +1386,17 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
return NULL;
}
-static void
-ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
-static void
-ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops);
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
+/*
+ * Allocate a new hash and remove entries from @src and move them to the new hash.
+ * On success, the @src hash will be empty and should be freed.
+ */
+static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
@@ -1424,6 +1432,7 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
return new_hash;
}
+/* Move the @src entries to a newly allocated hash */
static struct ftrace_hash *
__ftrace_hash_move(struct ftrace_hash *src)
{
@@ -1435,9 +1444,29 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (ftrace_hash_empty(src))
return EMPTY_HASH;
- return dup_hash(src, size);
+ return __move_hash(src, size);
}
+/**
+ * ftrace_hash_move - move a new hash to a filter and do updates
+ * @ops: The ops with the hash that @dst points to
+ * @enable: True if for the filter hash, false for the notrace hash
+ * @dst: Points to the @ops hash that should be updated
+ * @src: The hash to update @dst with
+ *
+ * This is called when an ftrace_ops hash is being updated and the
+ * the kernel needs to reflect this. Note, this only updates the kernel
+ * function callbacks if the @ops is enabled (not to be confused with
+ * @enable above). If the @ops is enabled, its hash determines what
+ * callbacks get called. This function gets called when the @ops hash
+ * is updated and it requires new callbacks.
+ *
+ * On success the elements of @src is moved to @dst, and @dst is updated
+ * properly, as well as the functions determined by the @ops hashes
+ * are now calling the @ops callback function.
+ *
+ * Regardless of return type, @src should be freed with free_ftrace_hash().
+ */
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1467,11 +1496,11 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
* Remove the current set, update the hash and add
* them back.
*/
- ftrace_hash_rec_disable_modify(ops, enable);
+ ftrace_hash_rec_disable_modify(ops);
rcu_assign_pointer(*dst, new_hash);
- ftrace_hash_rec_enable_modify(ops, enable);
+ ftrace_hash_rec_enable_modify(ops);
return 0;
}
@@ -1587,7 +1616,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
* @end: end of range to search (inclusive). @end points to the last byte
* to check.
*
- * Returns rec->ip if the related ftrace location is a least partly within
+ * Returns: rec->ip if the related ftrace location is a least partly within
* the given address range. That is, the first address of the instruction
* that is either a NOP or call to the function tracer. It checks the ftrace
* internal tables to determine if the address belongs or not.
@@ -1595,43 +1624,42 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
struct dyn_ftrace *rec;
+ unsigned long ip = 0;
+ rcu_read_lock();
rec = lookup_rec(start, end);
if (rec)
- return rec->ip;
+ ip = rec->ip;
+ rcu_read_unlock();
- return 0;
+ return ip;
}
/**
* ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * If @ip matches the ftrace location, return @ip.
- * If @ip matches sym+0, return sym's ftrace location.
- * Otherwise, return 0.
+ * Returns:
+ * * If @ip matches the ftrace location, return @ip.
+ * * If @ip matches sym+0, return sym's ftrace location.
+ * * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
- struct dyn_ftrace *rec;
+ unsigned long loc;
unsigned long offset;
unsigned long size;
- rec = lookup_rec(ip, ip);
- if (!rec) {
+ loc = ftrace_location_range(ip, ip);
+ if (!loc) {
if (!kallsyms_lookup_size_offset(ip, &size, &offset))
- goto out;
+ return 0;
/* map sym+0 to __fentry__ */
if (!offset)
- rec = lookup_rec(ip, ip + size - 1);
+ loc = ftrace_location_range(ip, ip + size - 1);
}
-
- if (rec)
- return rec->ip;
-
-out:
- return 0;
+ return loc;
}
/**
@@ -1639,7 +1667,7 @@ out:
* @start: start of range to search
* @end: end of range to search (inclusive). @end points to the last byte to check.
*
- * Returns 1 if @start and @end contains a ftrace location.
+ * Returns: 1 if @start and @end contains a ftrace location.
* That is, the instruction that is either a NOP or call to
* the function tracer. It checks the ftrace internal tables to
* determine if the address belongs or not.
@@ -1693,12 +1721,21 @@ static bool skip_record(struct dyn_ftrace *rec)
!(rec->flags & FTRACE_FL_ENABLED);
}
+/*
+ * This is the main engine to the ftrace updates to the dyn_ftrace records.
+ *
+ * It will iterate through all the available ftrace functions
+ * (the ones that ftrace can have callbacks to) and set the flags
+ * in the associated dyn_ftrace records.
+ *
+ * @inc: If true, the functions associated to @ops are added to
+ * the dyn_ftrace records, otherwise they are removed.
+ */
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
- int filter_hash,
bool inc)
{
struct ftrace_hash *hash;
- struct ftrace_hash *other_hash;
+ struct ftrace_hash *notrace_hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
bool update = false;
@@ -1710,35 +1747,16 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return false;
/*
- * In the filter_hash case:
* If the count is zero, we update all records.
* Otherwise we just update the items in the hash.
- *
- * In the notrace_hash case:
- * We enable the update in the hash.
- * As disabling notrace means enabling the tracing,
- * and enabling notrace means disabling, the inc variable
- * gets inversed.
*/
- if (filter_hash) {
- hash = ops->func_hash->filter_hash;
- other_hash = ops->func_hash->notrace_hash;
- if (ftrace_hash_empty(hash))
- all = true;
- } else {
- inc = !inc;
- hash = ops->func_hash->notrace_hash;
- other_hash = ops->func_hash->filter_hash;
- /*
- * If the notrace hash has no items,
- * then there's nothing to do.
- */
- if (ftrace_hash_empty(hash))
- return false;
- }
+ hash = ops->func_hash->filter_hash;
+ notrace_hash = ops->func_hash->notrace_hash;
+ if (ftrace_hash_empty(hash))
+ all = true;
do_for_each_ftrace_rec(pg, rec) {
- int in_other_hash = 0;
+ int in_notrace_hash = 0;
int in_hash = 0;
int match = 0;
@@ -1750,26 +1768,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* Only the filter_hash affects all records.
* Update if the record is not in the notrace hash.
*/
- if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
+ if (!notrace_hash || !ftrace_lookup_ip(notrace_hash, rec->ip))
match = 1;
} else {
in_hash = !!ftrace_lookup_ip(hash, rec->ip);
- in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
+ in_notrace_hash = !!ftrace_lookup_ip(notrace_hash, rec->ip);
/*
- * If filter_hash is set, we want to match all functions
- * that are in the hash but not in the other hash.
- *
- * If filter_hash is not set, then we are decrementing.
- * That means we match anything that is in the hash
- * and also in the other_hash. That is, we need to turn
- * off functions in the other hash because they are disabled
- * by this hash.
+ * We want to match all functions that are in the hash but
+ * not in the other hash.
*/
- if (filter_hash && in_hash && !in_other_hash)
- match = 1;
- else if (!filter_hash && in_hash &&
- (in_other_hash || ftrace_hash_empty(other_hash)))
+ if (in_hash && !in_notrace_hash)
match = 1;
}
if (!match)
@@ -1875,24 +1884,48 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return update;
}
-static bool ftrace_hash_rec_disable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is removed from tracing. It will decrement
+ * the counters of the dyn_ftrace records for all the functions that
+ * the @ops attached to.
+ */
+static bool ftrace_hash_rec_disable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 0);
+ return __ftrace_hash_rec_update(ops, false);
}
-static bool ftrace_hash_rec_enable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is added to tracing. It will increment
+ * the counters of the dyn_ftrace records for all the functions that
+ * the @ops attached to.
+ */
+static bool ftrace_hash_rec_enable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 1);
+ return __ftrace_hash_rec_update(ops, true);
}
-static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
- int filter_hash, int inc)
+/*
+ * This function will update what functions @ops traces when its filter
+ * changes.
+ *
+ * The @inc states if the @ops callbacks are going to be added or removed.
+ * When one of the @ops hashes are updated to a "new_hash" the dyn_ftrace
+ * records are update via:
+ *
+ * ftrace_hash_rec_disable_modify(ops);
+ * ops->hash = new_hash
+ * ftrace_hash_rec_enable_modify(ops);
+ *
+ * Where the @ops is removed from all the records it is tracing using
+ * its old hash. The @ops hash is updated to the new hash, and then
+ * the @ops is added back to the records so that it is tracing all
+ * the new functions.
+ */
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, bool inc)
{
struct ftrace_ops *op;
- __ftrace_hash_rec_update(ops, filter_hash, inc);
+ __ftrace_hash_rec_update(ops, inc);
if (ops->func_hash != &global_ops.local_hash)
return;
@@ -1906,20 +1939,18 @@ static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
if (op == ops)
continue;
if (op->func_hash == &global_ops.local_hash)
- __ftrace_hash_rec_update(op, filter_hash, inc);
+ __ftrace_hash_rec_update(op, inc);
} while_for_each_ftrace_op(op);
}
-static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+ ftrace_hash_rec_update_modify(ops, false);
}
-static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+ ftrace_hash_rec_update_modify(ops, true);
}
/*
@@ -2022,7 +2053,7 @@ rollback:
continue;
if (rec == end)
- goto err_out;
+ return -EBUSY;
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -2035,7 +2066,6 @@ rollback:
rec->flags |= FTRACE_FL_IPMODIFY;
} while_for_each_ftrace_rec();
-err_out:
return -EBUSY;
}
@@ -2537,7 +2567,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
-int ftrace_direct_func_count;
/*
* Search the direct_functions hash to see if the given instruction pointer
@@ -2574,7 +2603,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
* wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
* is not set, then it wants to convert to the normal callback.
*
- * Returns the address of the trampoline to set to
+ * Returns: the address of the trampoline to set to
*/
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
@@ -2615,7 +2644,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
* a function that saves all the regs. Basically the '_EN' version
* represents the current state of the function.
*
- * Returns the address of the trampoline that is currently being called
+ * Returns: the address of the trampoline that is currently being called
*/
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
@@ -2719,7 +2748,7 @@ struct ftrace_rec_iter {
/**
* ftrace_rec_iter_start - start up iterating over traced functions
*
- * Returns an iterator handle that is used to iterate over all
+ * Returns: an iterator handle that is used to iterate over all
* the records that represent address locations where functions
* are traced.
*
@@ -2751,7 +2780,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
* ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
- * Returns the next iterator after the given iterator @iter.
+ * Returns: the next iterator after the given iterator @iter.
*/
struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
{
@@ -2776,7 +2805,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
* ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
- * Returns the record that the current @iter is at.
+ * Returns: the record that the current @iter is at.
*/
struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
{
@@ -3043,7 +3072,7 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
return ret;
}
- if (ftrace_hash_rec_enable(ops, 1))
+ if (ftrace_hash_rec_enable(ops))
command |= FTRACE_UPDATE_CALLS;
ftrace_startup_enable(command);
@@ -3085,7 +3114,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
/* Disabling ipmodify never fails */
ftrace_hash_ipmodify_disable(ops);
- if (ftrace_hash_rec_disable(ops, 1))
+ if (ftrace_hash_rec_disable(ops))
command |= FTRACE_UPDATE_CALLS;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -3156,8 +3185,7 @@ out:
* synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- synchronize_rcu_tasks();
+ synchronize_rcu_tasks();
ftrace_trampoline_free(ops);
}
@@ -3165,7 +3193,487 @@ out:
return 0;
}
-static u64 ftrace_update_time;
+/* Simply make a copy of @src and return it */
+static struct ftrace_hash *copy_hash(struct ftrace_hash *src)
+{
+ if (ftrace_hash_empty(src))
+ return EMPTY_HASH;
+
+ return alloc_and_copy_ftrace_hash(src->size_bits, src);
+}
+
+/*
+ * Append @new_hash entries to @hash:
+ *
+ * If @hash is the EMPTY_HASH then it traces all functions and nothing
+ * needs to be done.
+ *
+ * If @new_hash is the EMPTY_HASH, then make *hash the EMPTY_HASH so
+ * that it traces everything.
+ *
+ * Otherwise, go through all of @new_hash and add anything that @hash
+ * doesn't already have, to @hash.
+ *
+ * The filter_hash updates uses just the append_hash() function
+ * and the notrace_hash does not.
+ */
+static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash,
+ int size_bits)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ if (*hash) {
+ /* An empty hash does everything */
+ if (ftrace_hash_empty(*hash))
+ return 0;
+ } else {
+ *hash = alloc_ftrace_hash(size_bits);
+ if (!*hash)
+ return -ENOMEM;
+ }
+
+ /* If new_hash has everything make hash have everything */
+ if (ftrace_hash_empty(new_hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash->buckets[i], hlist) {
+ /* Only add if not already in hash */
+ if (!__ftrace_lookup_ip(*hash, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Add to @hash only those that are in both @new_hash1 and @new_hash2
+ *
+ * The notrace_hash updates uses just the intersect_hash() function
+ * and the filter_hash does not.
+ */
+static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash1,
+ struct ftrace_hash *new_hash2)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ /*
+ * If new_hash1 or new_hash2 is the EMPTY_HASH then make the hash
+ * empty as well as empty for notrace means none are notraced.
+ */
+ if (ftrace_hash_empty(new_hash1) || ftrace_hash_empty(new_hash2)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash1->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash1->buckets[i], hlist) {
+ /* Only add if in both @new_hash1 and @new_hash2 */
+ if (__ftrace_lookup_ip(new_hash2, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ /* If nothing intersects, make it the empty set */
+ if (ftrace_hash_empty(*hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ }
+ return 0;
+}
+
+/* Return a new hash that has a union of all @ops->filter_hash entries */
+static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *new_hash = NULL;
+ struct ftrace_ops *subops;
+ int size_bits;
+ int ret;
+
+ if (ops->func_hash->filter_hash)
+ size_bits = ops->func_hash->filter_hash->size_bits;
+ else
+ size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return NULL;
+ }
+ /* Nothing more to do if new_hash is empty */
+ if (ftrace_hash_empty(new_hash))
+ break;
+ }
+ /* Can't return NULL as that means this failed */
+ return new_hash ? : EMPTY_HASH;
+}
+
+/* Make @ops trace evenything except what all its subops do not trace */
+static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *new_hash = NULL;
+ struct ftrace_ops *subops;
+ int size_bits;
+ int ret;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ struct ftrace_hash *next_hash;
+
+ if (!new_hash) {
+ size_bits = subops->func_hash->notrace_hash->size_bits;
+ new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash);
+ if (!new_hash)
+ return NULL;
+ continue;
+ }
+ size_bits = new_hash->size_bits;
+ next_hash = new_hash;
+ new_hash = alloc_ftrace_hash(size_bits);
+ ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash);
+ free_ftrace_hash(next_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return NULL;
+ }
+ /* Nothing more to do if new_hash is empty */
+ if (ftrace_hash_empty(new_hash))
+ break;
+ }
+ return new_hash;
+}
+
+static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ if (ftrace_hash_empty(A))
+ return ftrace_hash_empty(B);
+
+ if (ftrace_hash_empty(B))
+ return ftrace_hash_empty(A);
+
+ if (A->count != B->count)
+ return false;
+
+ size = 1 << A->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &A->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(B, entry->ip))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash);
+
+static int __ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
+ struct ftrace_hash **orig_hash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
+ int ret;
+
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(ops, enable, orig_hash, hash);
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ return ret;
+}
+
+static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_hash,
+ struct ftrace_hash *notrace_hash)
+{
+ int ret;
+
+ if (!ops_equal(filter_hash, ops->func_hash->filter_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->filter_hash,
+ filter_hash, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!ops_equal(notrace_hash, ops->func_hash->notrace_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->notrace_hash,
+ notrace_hash, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * ftrace_startup_subops - enable tracing for subops of an ops
+ * @ops: Manager ops (used to pick all the functions of its subops)
+ * @subops: A new ops to add to @ops
+ * @command: Extra commands to use to enable tracing
+ *
+ * The @ops is a manager @ops that has the filter that includes all the functions
+ * that its list of subops are tracing. Adding a new @subops will add the
+ * functions of @subops to @ops.
+ */
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *save_filter_hash;
+ struct ftrace_hash *save_notrace_hash;
+ int size_bits;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ ftrace_ops_init(ops);
+ ftrace_ops_init(subops);
+
+ if (WARN_ON_ONCE(subops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EBUSY;
+
+ /* Make everything canonical (Just in case!) */
+ if (!ops->func_hash->filter_hash)
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ if (!ops->func_hash->notrace_hash)
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ if (!subops->func_hash->filter_hash)
+ subops->func_hash->filter_hash = EMPTY_HASH;
+ if (!subops->func_hash->notrace_hash)
+ subops->func_hash->notrace_hash = EMPTY_HASH;
+
+ /* For the first subops to ops just enable it normally */
+ if (list_empty(&ops->subop_list)) {
+ /* Just use the subops hashes */
+ filter_hash = copy_hash(subops->func_hash->filter_hash);
+ notrace_hash = copy_hash(subops->func_hash->notrace_hash);
+ if (!filter_hash || !notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return -ENOMEM;
+ }
+
+ save_filter_hash = ops->func_hash->filter_hash;
+ save_notrace_hash = ops->func_hash->notrace_hash;
+
+ ops->func_hash->filter_hash = filter_hash;
+ ops->func_hash->notrace_hash = notrace_hash;
+ list_add(&subops->list, &ops->subop_list);
+ ret = ftrace_startup(ops, command);
+ if (ret < 0) {
+ list_del(&subops->list);
+ ops->func_hash->filter_hash = save_filter_hash;
+ ops->func_hash->notrace_hash = save_notrace_hash;
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ } else {
+ free_ftrace_hash(save_filter_hash);
+ free_ftrace_hash(save_notrace_hash);
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+ }
+
+ /*
+ * Here there's already something attached. Here are the rules:
+ * o If either filter_hash is empty then the final stays empty
+ * o Otherwise, the final is a superset of both hashes
+ * o If either notrace_hash is empty then the final stays empty
+ * o Otherwise, the final is an intersection between the hashes
+ */
+ if (ftrace_hash_empty(ops->func_hash->filter_hash) ||
+ ftrace_hash_empty(subops->func_hash->filter_hash)) {
+ filter_hash = EMPTY_HASH;
+ } else {
+ size_bits = max(ops->func_hash->filter_hash->size_bits,
+ subops->func_hash->filter_hash->size_bits);
+ filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash);
+ if (!filter_hash)
+ return -ENOMEM;
+ ret = append_hash(&filter_hash, subops->func_hash->filter_hash,
+ size_bits);
+ if (ret < 0) {
+ free_ftrace_hash(filter_hash);
+ return ret;
+ }
+ }
+
+ if (ftrace_hash_empty(ops->func_hash->notrace_hash) ||
+ ftrace_hash_empty(subops->func_hash->notrace_hash)) {
+ notrace_hash = EMPTY_HASH;
+ } else {
+ size_bits = max(ops->func_hash->filter_hash->size_bits,
+ subops->func_hash->filter_hash->size_bits);
+ notrace_hash = alloc_ftrace_hash(size_bits);
+ if (!notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ return -ENOMEM;
+ }
+
+ ret = intersect_hash(&notrace_hash, ops->func_hash->filter_hash,
+ subops->func_hash->filter_hash);
+ if (ret < 0) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+ }
+ }
+
+ list_add(&subops->list, &ops->subop_list);
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ if (ret < 0) {
+ list_del(&subops->list);
+ } else {
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+}
+
+/**
+ * ftrace_shutdown_subops - Remove a subops from a manager ops
+ * @ops: A manager ops to remove @subops from
+ * @subops: The subops to remove from @ops
+ * @command: Any extra command flags to add to modifying the text
+ *
+ * Removes the functions being traced by the @subops from @ops. Note, it
+ * will not affect functions that are being traced by other subops that
+ * still exist in @ops.
+ *
+ * If the last subops is removed from @ops, then @ops is shutdown normally.
+ */
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ if (WARN_ON_ONCE(!(subops->flags & FTRACE_OPS_FL_ENABLED)))
+ return -EINVAL;
+
+ list_del(&subops->list);
+
+ if (list_empty(&ops->subop_list)) {
+ /* Last one, just disable the current ops */
+
+ ret = ftrace_shutdown(ops, command);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ return ret;
+ }
+
+ subops->flags &= ~FTRACE_OPS_FL_ENABLED;
+
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+
+ return 0;
+ }
+
+ /* Rebuild the hashes without subops */
+ filter_hash = append_hashes(ops);
+ notrace_hash = intersect_hashes(ops);
+ if (!filter_hash || !notrace_hash) {
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ list_add(&subops->list, &ops->subop_list);
+ return -ENOMEM;
+ }
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ } else {
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+ }
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+}
+
+static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
+ struct ftrace_hash **orig_subhash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops *ops = subops->managed;
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *save_hash;
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Manager ops can not be subops (yet) */
+ if (WARN_ON_ONCE(!ops || ops->flags & FTRACE_OPS_FL_SUBOP))
+ return -EINVAL;
+
+ /* Move the new hash over to the subops hash */
+ save_hash = *orig_subhash;
+ *orig_subhash = __ftrace_hash_move(hash);
+ if (!*orig_subhash) {
+ *orig_subhash = save_hash;
+ return -ENOMEM;
+ }
+
+ /* Create a new_hash to hold the ops new functions */
+ if (enable) {
+ orig_hash = &ops->func_hash->filter_hash;
+ new_hash = append_hashes(ops);
+ } else {
+ orig_hash = &ops->func_hash->notrace_hash;
+ new_hash = intersect_hashes(ops);
+ }
+
+ /* Move the hash over to the new hash */
+ ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable);
+
+ free_ftrace_hash(new_hash);
+
+ if (ret) {
+ /* Put back the original hash */
+ free_ftrace_hash_rcu(*orig_subhash);
+ *orig_subhash = save_hash;
+ } else {
+ free_ftrace_hash_rcu(save_hash);
+ }
+ return ret;
+}
+
+
+u64 ftrace_update_time;
+u64 ftrace_total_mod_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
unsigned long ftrace_number_of_groups;
@@ -3185,7 +3693,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
- u64 start, stop;
+ u64 start, stop, update_time;
unsigned long update_cnt = 0;
unsigned long rec_flags = 0;
int i;
@@ -3229,7 +3737,11 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
}
stop = ftrace_now(raw_smp_processor_id());
- ftrace_update_time = stop - start;
+ update_time = stop - start;
+ if (mod)
+ ftrace_total_mod_time += update_time;
+ else
+ ftrace_update_time = update_time;
ftrace_update_tot_cnt += update_cnt;
return 0;
@@ -4010,6 +4522,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file)
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
* tracing_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
+ *
+ * Returns: 0 on success or a negative errno value on failure
*/
int
ftrace_regex_open(struct ftrace_ops *ops, int flag,
@@ -4199,12 +4713,12 @@ static int
add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
int clear_filter)
{
- long index = simple_strtoul(func_g->search, NULL, 0);
+ long index;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
/* The index starts at 1 */
- if (--index < 0)
+ if (kstrtoul(func_g->search, 0, &index) || --index < 0)
return 0;
do_for_each_ftrace_rec(pg, rec) {
@@ -4306,15 +4820,13 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
mod_g.len = strlen(mod_g.search);
}
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out_unlock;
+ return 0;
- if (func_g.type == MATCH_INDEX) {
- found = add_rec_by_index(hash, &func_g, clear_filter);
- goto out_unlock;
- }
+ if (func_g.type == MATCH_INDEX)
+ return add_rec_by_index(hash, &func_g, clear_filter);
do_for_each_ftrace_rec(pg, rec) {
@@ -4323,16 +4835,12 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
ret = enter_record(hash, rec, clear_filter);
- if (ret < 0) {
- found = ret;
- goto out_unlock;
- }
+ if (ret < 0)
+ return ret;
found = 1;
}
cond_resched();
} while_for_each_ftrace_rec();
- out_unlock:
- mutex_unlock(&ftrace_lock);
return found;
}
@@ -4379,36 +4887,33 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
struct ftrace_hash *hash,
int enable)
{
- struct ftrace_ops_hash old_hash_ops;
- struct ftrace_hash *old_hash;
- int ret;
-
- old_hash = *orig_hash;
- old_hash_ops.filter_hash = ops->func_hash->filter_hash;
- old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
- ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret) {
- ftrace_ops_update_code(ops, &old_hash_ops);
- free_ftrace_hash_rcu(old_hash);
- }
- return ret;
-}
+ if (ops->flags & FTRACE_OPS_FL_SUBOP)
+ return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable);
-static bool module_exists(const char *module)
-{
- /* All modules have the symbol __this_module */
- static const char this_mod[] = "__this_module";
- char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
- unsigned long val;
- int n;
+ /*
+ * If this ops is not enabled, it could be sharing its filters
+ * with a subop. If that's the case, update the subop instead of
+ * this ops. Shared filters are only allowed to have one ops set
+ * at a time, and if we update the ops that is not enabled,
+ * it will not affect subops that share it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) {
+ struct ftrace_ops *op;
- n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
+ /* Check if any other manager subops maps to this hash */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ struct ftrace_ops *subops;
- if (n > sizeof(modname) - 1)
- return false;
+ list_for_each_entry(subops, &op->subop_list, list) {
+ if ((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ subops->func_hash == ops->func_hash) {
+ return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable);
+ }
+ }
+ } while_for_each_ftrace_op(op);
+ }
- val = module_kallsyms_lookup_name(modname);
- return val != 0;
+ return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
}
static int cache_mod(struct trace_array *tr,
@@ -4416,14 +4921,14 @@ static int cache_mod(struct trace_array *tr,
{
struct ftrace_mod_load *ftrace_mod, *n;
struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
- int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
/* We do not cache inverse filters */
if (func[0] == '!') {
+ int ret = -EINVAL;
+
func++;
- ret = -EINVAL;
/* Look to remove this hash */
list_for_each_entry_safe(ftrace_mod, n, head, list) {
@@ -4439,26 +4944,17 @@ static int cache_mod(struct trace_array *tr,
continue;
}
}
- goto out;
+ return ret;
}
- ret = -EINVAL;
/* We only care about modules that have not been loaded yet */
if (module_exists(module))
- goto out;
+ return -EINVAL;
/* Save this string off, and execute it when the module is loaded */
- ret = ftrace_add_mod(tr, func, module, enable);
- out:
- mutex_unlock(&ftrace_lock);
-
- return ret;
+ return ftrace_add_mod(tr, func, module, enable);
}
-static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
- int reset, int enable);
-
#ifdef CONFIG_MODULES
static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
char *mod, bool enable)
@@ -4562,6 +5058,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *func;
int ret;
+ if (!tr)
+ return -ENODEV;
+
/* match_records() modifies func, and we need the original */
func = kstrdup(func_orig, GFP_KERNEL);
if (!func)
@@ -4626,7 +5125,7 @@ struct ftrace_func_mapper {
/**
* allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
*
- * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
+ * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data.
*/
struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
{
@@ -4646,7 +5145,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
* @mapper: The mapper that has the ip maps
* @ip: the instruction pointer to find the data for
*
- * Returns the data mapped to @ip if found otherwise NULL. The return
+ * Returns: the data mapped to @ip if found otherwise NULL. The return
* is actually the address of the mapper data pointer. The address is
* returned for use cases where the data is no bigger than a long, and
* the user can use the data pointer as its data instead of having to
@@ -4672,7 +5171,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
* @ip: The instruction pointer address to map @data to
* @data: The data to map to @ip
*
- * Returns 0 on success otherwise an error.
+ * Returns: 0 on success otherwise an error.
*/
int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
unsigned long ip, void *data)
@@ -4701,7 +5200,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
* @mapper: The mapper that has the ip maps
* @ip: The instruction pointer address to remove the data from
*
- * Returns the data if it is found, otherwise NULL.
+ * Returns: the data if it is found, otherwise NULL.
* Note, if the data pointer is used as the data itself, (see
* ftrace_func_mapper_find_ip(), then the return value may be meaningless,
* if the data pointer was set to zero.
@@ -4762,7 +5261,7 @@ static void release_probe(struct ftrace_func_probe *probe)
{
struct ftrace_probe_ops *probe_ops;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
WARN_ON(probe->ref <= 0);
@@ -4780,7 +5279,6 @@ static void release_probe(struct ftrace_func_probe *probe)
list_del(&probe->list);
kfree(probe);
}
- mutex_unlock(&ftrace_lock);
}
static void acquire_probe_locked(struct ftrace_func_probe *probe)
@@ -5086,20 +5584,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex);
__init int register_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p;
- int ret = 0;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &ftrace_commands);
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -5109,20 +5602,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd)
__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry_safe(p, n, &ftrace_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -ENODEV;
}
static int ftrace_process_regex(struct ftrace_iterator *iter,
@@ -5132,7 +5622,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
- int ret = -EINVAL;
+ int ret;
func = strsep(&next, ":");
@@ -5149,17 +5639,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
command = strsep(&next, ":");
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->func(tr, hash, func, command, next, enable);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->func(tr, hash, func, command, next, enable);
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t
@@ -5193,12 +5680,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
parser->idx, enable);
trace_parser_clear(parser);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = read;
- out:
- return ret;
+ return read;
}
ssize_t
@@ -5230,6 +5715,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return -ENOENT;
free_hash_entry(hash, entry);
return 0;
+ } else if (__ftrace_lookup_ip(hash, ip) != NULL) {
+ /* Already exists */
+ return 0;
}
entry = add_hash_entry(hash, ip);
@@ -5259,7 +5747,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long *ips, unsigned int cnt,
- int remove, int reset, int enable)
+ int remove, int reset, int enable, char *mod)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5285,7 +5773,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
goto out_regex_unlock;
}
- if (buf && !ftrace_match_records(hash, buf, len)) {
+ if (buf && !match_records(hash, buf, len, mod)) {
+ /* If this was for a module and nothing was enabled, flag it */
+ if (mod)
+ (*orig_hash)->flags |= FTRACE_HASH_FL_MOD;
+
+ /*
+ * Even if it is a mod, return error to let caller know
+ * nothing was added
+ */
ret = -EINVAL;
goto out_regex_unlock;
}
@@ -5310,19 +5806,11 @@ static int
ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-struct ftrace_direct_func {
- struct list_head next;
- unsigned long addr;
- int count;
-};
-
-static LIST_HEAD(ftrace_direct_funcs);
-
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
/*
@@ -5363,6 +5851,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
}
}
+static void register_ftrace_direct_cb(struct rcu_head *rhp)
+{
+ struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu);
+
+ free_ftrace_hash(fhp);
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
@@ -5461,10 +5956,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash && free_hash != EMPTY_HASH) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
+ if (free_hash && free_hash != EMPTY_HASH)
+ call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
if (new_hash)
free_ftrace_hash(new_hash);
@@ -5477,6 +5970,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
* unregister_ftrace_direct - Remove calls to custom trampoline
* previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the direct function that is called by the @ops functions
+ * @free_filters: Set to true to remove all filters for the ftrace_ops, false otherwise
*
* This is used to remove a direct calls to @addr from the nop locations
* of the functions registered in @ops (with by ftrace_set_filter_ip
@@ -5625,10 +6120,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct);
/**
* ftrace_set_filter_ip - set a function to filter on in ftrace by address
- * @ops - the ops to set the filter with
- * @ip - the address to add to or remove from the filter.
- * @remove - non zero to remove the ip from the filter
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @ip: the address to add to or remove from the filter.
+ * @remove: non zero to remove the ip from the filter
+ * @reset: non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ip is NULL, it fails to update filter.
@@ -5647,11 +6142,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
* ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
- * @ops - the ops to set the filter with
- * @ips - the array of addresses to add to or remove from the filter.
- * @cnt - the number of addresses in @ips
- * @remove - non zero to remove ips from the filter
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @ips: the array of addresses to add to or remove from the filter.
+ * @cnt: the number of addresses in @ips
+ * @remove: non zero to remove ips from the filter
+ * @reset: non zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled
* If @ips array or any ip specified within is NULL , it fails to update filter.
@@ -5670,7 +6165,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
/**
* ftrace_ops_set_global_filter - setup ops to use global filters
- * @ops - the ops which will use the global filters
+ * @ops: the ops which will use the global filters
*
* ftrace users who need global function trace filtering should call this.
* It can set the global filter only if ops were not initialized before.
@@ -5689,15 +6184,46 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
+ char *mod = NULL, *func, *command, *next = buf;
+ char *tmp __free(kfree) = NULL;
+ struct trace_array *tr = ops->private;
+ int ret;
+
+ func = strsep(&next, ":");
+
+ /* This can also handle :mod: parsing */
+ if (next) {
+ if (!tr)
+ return -EINVAL;
+
+ command = strsep(&next, ":");
+ if (strcmp(command, "mod") != 0)
+ return -EINVAL;
+
+ mod = next;
+ len = command - func;
+ /* Save the original func as ftrace_set_hash() can modify it */
+ tmp = kstrdup(func, GFP_KERNEL);
+ }
+
+ ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod);
+
+ if (tr && mod && ret < 0) {
+ /* Did tmp fail to allocate? */
+ if (!tmp)
+ return -ENOMEM;
+ ret = cache_mod(tr, tmp, mod, enable);
+ }
+
+ return ret;
}
/**
* ftrace_set_filter - set a function to filter on in ftrace
- * @ops - the ops to set the filter with
- * @buf - the string that holds the function filter text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the filter with
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
@@ -5716,10 +6242,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
/**
* ftrace_set_notrace - set a function to not trace in ftrace
- * @ops - the ops to set the notrace filter with
- * @buf - the string that holds the function notrace text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @ops: the ops to set the notrace filter with
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
@@ -5738,9 +6264,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
EXPORT_SYMBOL_GPL(ftrace_set_notrace);
/**
* ftrace_set_global_filter - set a function to filter on with global tracers
- * @buf - the string that holds the function filter text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @buf: the string that holds the function filter text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
@@ -5753,9 +6279,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
/**
* ftrace_set_global_notrace - set a function to not trace with global tracers
- * @buf - the string that holds the function notrace text.
- * @len - the length of the string.
- * @reset - non zero to reset all filters before applying this filter.
+ * @buf: the string that holds the function notrace text.
+ * @len: the length of the string.
+ * @reset: non-zero to reset all filters before applying this filter.
*
* Notrace Filters denote which functions should not be enabled when tracing
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
@@ -5814,9 +6340,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function);
static int __init set_graph_max_depth_function(char *str)
{
- if (!str)
+ if (!str || kstrtouint(str, 0, &fgraph_max_depth))
return 0;
- fgraph_max_depth = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("ftrace_graph_max_depth=", set_graph_max_depth_function);
@@ -5854,6 +6379,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
ftrace_ops_init(ops);
+ /* The trace_array is needed for caching module function filters */
+ if (!ops->private) {
+ struct trace_array *tr = trace_get_global_array();
+
+ ops->private = tr;
+ ftrace_init_trace_array(tr);
+ }
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -6293,12 +6826,10 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
func_g.len = strlen(func_g.search);
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
- if (unlikely(ftrace_disabled)) {
- mutex_unlock(&ftrace_lock);
+ if (unlikely(ftrace_disabled))
return -ENODEV;
- }
do_for_each_ftrace_rec(pg, rec) {
@@ -6314,7 +6845,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
if (add_hash_entry(hash, rec->ip) == NULL)
- goto out;
+ return 0;
} else {
if (entry) {
free_hash_entry(hash, entry);
@@ -6323,13 +6854,8 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
} while_for_each_ftrace_rec();
-out:
- mutex_unlock(&ftrace_lock);
-
- if (fail)
- return -EINVAL;
- return 0;
+ return fail ? -EINVAL : 0;
}
static ssize_t
@@ -6593,6 +7119,8 @@ static int ftrace_process_locs(struct module *mod,
/* We should have used all pages unless we skipped some */
if (pg_unuse) {
WARN_ON(!skipped);
+ /* Need to synchronize with ftrace_location_range() */
+ synchronize_rcu();
ftrace_free_pages(pg_unuse);
}
return ret;
@@ -6806,6 +7334,9 @@ void ftrace_release_mod(struct module *mod)
out_unlock:
mutex_unlock(&ftrace_lock);
+ /* Need to synchronize with ftrace_location_range() */
+ if (tmp_page)
+ synchronize_rcu();
for (pg = tmp_page; pg; pg = tmp_page) {
/* Needs to be called outside of ftrace_lock */
@@ -6967,7 +7498,7 @@ allocate_ftrace_mod_map(struct module *mod,
return mod_map;
}
-static const char *
+static int
ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
@@ -6988,21 +7519,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
*size = found_func->size;
if (off)
*off = addr - found_func->ip;
- if (sym)
- strscpy(sym, found_func->name, KSYM_NAME_LEN);
-
- return found_func->name;
+ return strscpy(sym, found_func->name, KSYM_NAME_LEN);
}
- return NULL;
+ return 0;
}
-const char *
+int
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char **modname, char *sym)
{
struct ftrace_mod_map *mod_map;
- const char *ret = NULL;
+ int ret = 0;
/* mod_map is freed via call_rcu() */
preempt_disable();
@@ -7139,6 +7667,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
unsigned long start = (unsigned long)(start_ptr);
unsigned long end = (unsigned long)(end_ptr);
struct ftrace_page **last_pg = &ftrace_pages_start;
+ struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
@@ -7180,12 +7709,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
- if (pg->records) {
- free_pages((unsigned long)pg->records, pg->order);
- ftrace_number_of_pages -= 1 << pg->order;
- }
- ftrace_number_of_groups--;
- kfree(pg);
+ pg->next = tmp_page;
+ tmp_page = pg;
pg = container_of(last_pg, struct ftrace_page, next);
if (!(*last_pg))
ftrace_pages = pg;
@@ -7202,6 +7727,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
clear_func_from_hashes(func);
kfree(func);
}
+ /* Need to synchronize with ftrace_location_range() */
+ if (tmp_page) {
+ synchronize_rcu();
+ ftrace_free_pages(tmp_page);
+ }
}
void __init ftrace_free_init_mem(void)
@@ -7290,9 +7820,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
+ if (tr->flags & TRACE_ARRAY_FL_MOD_INIT)
+ return;
+
INIT_LIST_HEAD(&tr->func_probes);
INIT_LIST_HEAD(&tr->mod_trace);
INIT_LIST_HEAD(&tr->mod_notrace);
+
+ tr->flags |= TRACE_ARRAY_FL_MOD_INIT;
}
#else
@@ -7321,8 +7856,10 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
- tr->ops->private = tr;
+ if (!global_ops.private)
+ global_ops.private = tr;
ftrace_init_trace_array(tr);
+ init_array_fgraph_ops(tr, tr->ops);
}
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -7403,6 +7940,7 @@ out:
void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
+ kmsan_unpoison_memory(fregs, ftrace_regs_size());
__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
#else
@@ -7443,7 +7981,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func);
* have its own recursion protection, then it should call the
* ftrace_ops_assist_func() instead.
*
- * Returns the function that the trampoline should call for @ops.
+ * Returns: the function that the trampoline should call for @ops.
*/
ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
{
@@ -7761,7 +8299,7 @@ pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
switch (type) {
case TRACE_PIDS:
@@ -7777,14 +8315,13 @@ pid_write(struct file *filp, const char __user *ubuf,
lockdep_is_held(&ftrace_lock));
break;
default:
- ret = -EINVAL;
WARN_ON_ONCE(1);
- goto out;
+ return -EINVAL;
}
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
switch (type) {
case TRACE_PIDS:
@@ -7813,11 +8350,8 @@ pid_write(struct file *filp, const char __user *ubuf,
ftrace_update_pid_func();
ftrace_startup_all(0);
- out:
- mutex_unlock(&ftrace_lock);
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -7892,12 +8426,13 @@ void ftrace_kill(void)
ftrace_disabled = 1;
ftrace_enabled = 0;
ftrace_trace_function = ftrace_stub;
+ kprobe_ftrace_kill();
}
/**
* ftrace_is_dead - Test if ftrace is dead or not.
*
- * Returns 1 if ftrace is "dead", zero otherwise.
+ * Returns: 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
@@ -8142,8 +8677,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr)
* @addrs array, which needs to be big enough to store at least @cnt
* addresses.
*
- * This function returns 0 if all provided symbols are found,
- * -ESRCH otherwise.
+ * Returns: 0 if all provided symbols are found, -ESRCH otherwise.
*/
int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs)
{
@@ -8217,20 +8751,20 @@ static bool is_permanent_ops_registered(void)
}
static int
-ftrace_enable_sysctl(struct ctl_table *table, int write,
+ftrace_enable_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = -ENODEV;
+ int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out;
+ return -ENODEV;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
- goto out;
+ return ret;
if (ftrace_enabled) {
@@ -8244,8 +8778,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
} else {
if (is_permanent_ops_registered()) {
ftrace_enabled = true;
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* stopping ftrace calls (just send to ftrace_stub) */
@@ -8255,12 +8788,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
}
last_ftrace_enabled = !!ftrace_enabled;
- out:
- mutex_unlock(&ftrace_lock);
- return ret;
+ return 0;
}
-static struct ctl_table ftrace_sysctls[] = {
+static const struct ctl_table ftrace_sysctls[] = {
{
.procname = "ftrace_enabled",
.data = &ftrace_enabled,
@@ -8268,7 +8799,6 @@ static struct ctl_table ftrace_sysctls[] = {
.mode = 0644,
.proc_handler = ftrace_enable_sysctl,
},
- {}
};
static int __init ftrace_sysctl_init(void)
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 5012c04f92c0..3235470e61b3 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -15,6 +15,8 @@ extern struct ftrace_ops global_ops;
int ftrace_startup(struct ftrace_ops *ops, int command);
int ftrace_shutdown(struct ftrace_ops *ops, int command);
int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
#else /* !CONFIG_DYNAMIC_FTRACE */
@@ -38,14 +40,26 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
return 1;
}
+static inline int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
+static inline int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern int ftrace_graph_active;
-void update_function_graph_func(void);
+# ifdef CONFIG_DYNAMIC_FTRACE
+extern void fgraph_update_pid_func(void);
+# else
+static inline void fgraph_update_pid_func(void) {}
+# endif
#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
# define ftrace_graph_active 0
-static inline void update_function_graph_func(void) { }
+static inline void fgraph_update_pid_func(void) {}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#else /* !CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 95106d02b32d..c62b9b3cfb3d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -354,7 +354,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (upper_count-- > 0) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*upper_next = chunk;
@@ -365,7 +365,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (lower_count-- > 0) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*lower_next = chunk;
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
@@ -451,6 +451,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
/**
* trace_pid_list_free - Frees an allocated pid_list.
+ * @pid_list: The pid list to free.
*
* Frees the memory for a pid_list that was allocated.
*/
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 8c4ffd076162..314ffc143039 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
static struct completion done;
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
static void busy_wait(ulong time)
{
u64 start, end;
@@ -215,4 +213,5 @@ static void __exit preemptirq_delay_exit(void)
module_init(preemptirq_delay_init)
module_exit(preemptirq_delay_exit)
+MODULE_DESCRIPTION("Preempt / IRQ disable delay thread to test latency tracers");
MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index fa03094e9e69..30d224946881 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!handler))
return NULL;
+#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
/*
* This expects the caller will set up a rethook on a function entry.
* When the function returns, the rethook will eventually be reclaimed
@@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
*/
if (unlikely(!rcu_is_watching()))
return NULL;
+#endif
return (struct rethook_node *)objpool_pop(&rh->pool);
}
@@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame
if (WARN_ON_ONCE(!cur))
return 0;
- if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ if (tsk != current && task_is_running(tsk))
return 0;
do {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aa332ace108b..bb6089c2951e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,6 +9,7 @@
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/sched/clock.h>
+#include <linux/cacheflush.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
@@ -26,10 +27,13 @@
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/mm.h>
#include <asm/local64.h>
#include <asm/local.h>
+#include "trace.h"
+
/*
* The "absolute" timestamp in the buffer is only 59 bits.
* If a clock has the 5 MSBs set, it needs to be saved and
@@ -40,6 +44,21 @@
static void update_pages_handler(struct work_struct *work);
+#define RING_BUFFER_META_MAGIC 0xBADFEED
+
+struct ring_buffer_meta {
+ int magic;
+ int struct_size;
+ unsigned long text_addr;
+ unsigned long data_addr;
+ unsigned long first_buffer;
+ unsigned long head_buffer;
+ unsigned long commit_buffer;
+ __u32 subbuf_size;
+ __u32 nr_subbufs;
+ int buffers[];
+};
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -312,6 +331,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_MASK (3 << 30)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -338,6 +359,8 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
+ u32 id:30; /* ID for external mapping */
+ u32 range:1; /* Mapped via a range */
struct buffer_data_page *page; /* Actual data page */
};
@@ -368,7 +391,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_pages((unsigned long)bpage->page, bpage->order);
+ /* Range pages are not to be freed */
+ if (!bpage->range)
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -384,6 +409,7 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ atomic_t seq;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -456,6 +482,8 @@ struct ring_buffer_per_cpu {
unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
+ /* pages generation counter, incremented when the list changes */
+ unsigned long cnt;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
@@ -483,6 +511,14 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
/* pages removed since last reset */
unsigned long pages_removed;
+
+ unsigned int mapped;
+ unsigned int user_mapped; /* user space mapping */
+ struct mutex mapping_lock;
+ unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct trace_buffer_meta *meta_page;
+ struct ring_buffer_meta *ring_meta;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -511,6 +547,12 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+ unsigned long range_addr_start;
+ unsigned long range_addr_end;
+
+ long last_text_delta;
+ long last_data_delta;
+
unsigned int subbuf_size;
unsigned int subbuf_order;
unsigned int max_data_size;
@@ -681,18 +723,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
}
/**
- * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
- * @buffer: The ring_buffer to get the number of pages from
- * @cpu: The cpu of the ring_buffer to get the number of pages from
- *
- * Returns the number of pages used by a per_cpu buffer of the ring buffer.
- */
-size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
-{
- return buffer->buffers[cpu]->nr_pages;
-}
-
-/**
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
@@ -753,6 +783,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
{
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+ /* For waiters waiting for the first wake up */
+ (void)atomic_fetch_inc_release(&rbwork->seq);
+
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
/* Only cpu_buffer sets the above flags */
@@ -834,51 +867,24 @@ static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
ret = !pagebusy && full_hit(buffer, cpu, full);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
+ if (!ret && (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)) {
+ cpu_buffer->shortest_full = full;
+ }
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
return ret;
}
-/**
- * ring_buffer_wait - wait for input to the ring buffer
- * @buffer: buffer to wait on
- * @cpu: the cpu buffer to wait on
- * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
- *
- * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
- * as data is added to any of the @buffer's cpu buffers. Otherwise
- * it will wait for data to be added to a specific cpu buffer.
- */
-int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
+static inline bool
+rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer,
+ int cpu, int full, ring_buffer_cond_fn cond, void *data)
{
- struct ring_buffer_per_cpu *cpu_buffer;
- DEFINE_WAIT(wait);
- struct rb_irq_work *work;
- int ret = 0;
-
- /*
- * Depending on what the caller is waiting for, either any
- * data in any cpu buffer, or a specific buffer, put the
- * caller on the appropriate wait queue.
- */
- if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
- /* Full only makes sense on per cpu reads */
- full = 0;
- } else {
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -ENODEV;
- cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
- }
+ if (rb_watermark_hit(buffer, cpu, full))
+ return true;
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ if (cond(data))
+ return true;
/*
* The events can happen in critical sections where
@@ -901,27 +907,82 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
* a task has been queued. It's OK for spurious wake ups.
*/
if (full)
- work->full_waiters_pending = true;
+ rbwork->full_waiters_pending = true;
else
- work->waiters_pending = true;
+ rbwork->waiters_pending = true;
- if (rb_watermark_hit(buffer, cpu, full))
- goto out;
+ return false;
+}
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
+struct rb_wait_data {
+ struct rb_irq_work *irq_work;
+ int seq;
+};
+
+/*
+ * The default wait condition for ring_buffer_wait() is to just to exit the
+ * wait loop the first time it is woken up.
+ */
+static bool rb_wait_once(void *data)
+{
+ struct rb_wait_data *rdata = data;
+ struct rb_irq_work *rbwork = rdata->irq_work;
+
+ return atomic_read_acquire(&rbwork->seq) != rdata->seq;
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
+ * @cond: condition function to break out of wait (NULL to run once)
+ * @data: the data to pass to @cond.
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
+ ring_buffer_cond_fn cond, void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct wait_queue_head *waitq;
+ struct rb_irq_work *rbwork;
+ struct rb_wait_data rdata;
+ int ret = 0;
+
+ /*
+ * Depending on what the caller is waiting for, either any
+ * data in any cpu buffer, or a specific buffer, put the
+ * caller on the appropriate wait queue.
+ */
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ rbwork = &buffer->irq_work;
+ /* Full only makes sense on per cpu reads */
+ full = 0;
+ } else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
+ cpu_buffer = buffer->buffers[cpu];
+ rbwork = &cpu_buffer->irq_work;
}
- schedule();
- out:
if (full)
- finish_wait(&work->full_waiters, &wait);
+ waitq = &rbwork->full_waiters;
else
- finish_wait(&work->waiters, &wait);
+ waitq = &rbwork->waiters;
+
+ /* Set up to exit loop as soon as it is woken */
+ if (!cond) {
+ cond = rb_wait_once;
+ rdata.irq_work = rbwork;
+ rdata.seq = atomic_read_acquire(&rbwork->seq);
+ data = &rdata;
+ }
- if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
- ret = -EINTR;
+ ret = wait_event_interruptible((*waitq),
+ rb_wait_cond(rbwork, buffer, cpu, full, cond, data));
return ret;
}
@@ -959,21 +1020,30 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
}
if (full) {
- unsigned long flags;
-
poll_wait(filp, &rbwork->full_waiters, poll_table);
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ if (rb_watermark_hit(buffer, cpu, full))
+ return EPOLLIN | EPOLLRDNORM;
+ /*
+ * Only allow full_waiters_pending update to be seen after
+ * the shortest_full is set (in rb_watermark_hit). If the
+ * writer sees the full_waiters_pending flag set, it will
+ * compare the amount in the ring buffer to shortest_full.
+ * If the amount in the ring buffer is greater than the
+ * shortest_full percent, it will call the irq_work handler
+ * to wake up this list. The irq_handler will reset shortest_full
+ * back to zero. That's done under the reader_lock, but
+ * the below smp_mb() makes sure that the update to
+ * full_waiters_pending doesn't leak up into the above.
+ */
+ smp_mb();
rbwork->full_waiters_pending = true;
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- } else {
- poll_wait(filp, &rbwork->waiters, poll_table);
- rbwork->waiters_pending = true;
+ return 0;
}
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -989,9 +1059,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
- if (full)
- return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
-
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -1022,7 +1089,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer)
u64 ts;
/* Skip retpolines :-( */
- if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local))
ts = trace_clock_local();
else
ts = buffer->clock();
@@ -1202,6 +1269,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
* Set the previous list pointer to have the HEAD flag.
*/
rb_set_list_to_head(head->list.prev);
+
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->head_buffer = (unsigned long)head->page;
+ }
}
static void rb_list_head_clear(struct list_head *list)
@@ -1355,7 +1427,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
- local_inc(&cpu_buffer->pages_touched);
/*
* Just make sure we have seen our old_write and synchronize
* with any interrupts that come in.
@@ -1392,8 +1463,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- /* Again, either we update tail_page or an interrupt does */
- (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
+ /* Either we update tail_page or an interrupt does */
+ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page))
+ local_inc(&cpu_buffer->pages_touched);
}
}
@@ -1405,6 +1477,20 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
+static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(list->next)->prev) != list))
+ return false;
+
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(list->prev)->next) != list))
+ return false;
+
+ return true;
+}
+
/**
* rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
@@ -1414,31 +1500,567 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
*/
static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = rb_list_head(cpu_buffer->pages);
- struct list_head *tmp;
+ struct list_head *head, *tmp;
+ unsigned long buffer_cnt;
+ unsigned long flags;
+ int nr_loops = 0;
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(head->next)->prev) != head))
+ /*
+ * Walk the linked list underpinning the ring buffer and validate all
+ * its next and prev links.
+ *
+ * The check acquires the reader_lock to avoid concurrent processing
+ * with code that could be modifying the list. However, the lock cannot
+ * be held for the entire duration of the walk, as this would make the
+ * time when interrupts are disabled non-deterministic, dependent on the
+ * ring buffer size. Therefore, the code releases and re-acquires the
+ * lock after checking each page. The ring_buffer_per_cpu.cnt variable
+ * is then used to detect if the list was modified while the lock was
+ * not held, in which case the check needs to be restarted.
+ *
+ * The code attempts to perform the check at most three times before
+ * giving up. This is acceptable because this is only a self-validation
+ * to detect problems early on. In practice, the list modification
+ * operations are fairly spaced, and so this check typically succeeds at
+ * most on the second try.
+ */
+again:
+ if (++nr_loops > 3)
return;
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(head->prev)->next) != head))
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ head = rb_list_head(cpu_buffer->pages);
+ if (!rb_check_links(cpu_buffer, head))
+ goto out_locked;
+ buffer_cnt = cpu_buffer->cnt;
+ tmp = head;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ while (true) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (buffer_cnt != cpu_buffer->cnt) {
+ /* The list was updated, try again. */
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ goto again;
+ }
+
+ tmp = rb_list_head(tmp->next);
+ if (tmp == head)
+ /* The iteration circled back, all is done. */
+ goto out_locked;
+
+ if (!rb_check_links(cpu_buffer, tmp))
+ goto out_locked;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+
+out_locked:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
+/*
+ * Take an address, add the meta data size as well as the array of
+ * array subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+ addr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_subbufs;
+ return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return the ring_buffer_meta for a given @cpu.
+ */
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
+{
+ int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *meta;
+ int nr_subbufs;
+
+ if (!ptr)
+ return NULL;
+
+ /* When nr_pages passed in is zero, the first meta has already been initialized */
+ if (!nr_pages) {
+ meta = (struct ring_buffer_meta *)ptr;
+ nr_subbufs = meta->nr_subbufs;
+ } else {
+ meta = NULL;
+ /* Include the reader page */
+ nr_subbufs = nr_pages + 1;
+ }
+
+ /*
+ * The first chunk may not be subbuffer aligned, where as
+ * the rest of the chunks are.
+ */
+ if (cpu) {
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* We can use multiplication to find chunks greater than 1 */
+ if (cpu > 1) {
+ unsigned long size;
+ unsigned long p;
+
+ /* Save the beginning of this CPU chunk */
+ p = ptr;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* Now all chunks after this are the same size */
+ size = ptr - p;
+ ptr += size * (cpu - 2);
+ }
+ }
+ return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+ int subbuf_size = meta->subbuf_size;
+ unsigned long ptr;
+
+ ptr = (unsigned long)meta;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+ return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long ptr;
+ int subbuf_size;
+
+ meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu);
+ if (!meta)
+ return NULL;
+
+ if (WARN_ON_ONCE(idx >= meta->nr_subbufs))
+ return NULL;
+
+ subbuf_size = meta->subbuf_size;
+
+ /* Map this buffer to the order that's in meta->buffers[] */
+ idx = meta->buffers[idx];
+
+ ptr = (unsigned long)rb_subbufs_from_meta(meta);
+
+ ptr += subbuf_size * idx;
+ if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end)
+ return NULL;
+
+ return (void *)ptr;
+}
+
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
+{
+ int subbuf_size = PAGE_SIZE;
+ struct buffer_data_page *subbuf;
+ unsigned long buffers_start;
+ unsigned long buffers_end;
+ int i;
+
+ if (!subbuf_mask)
+ return false;
+
+ /* Check the meta magic and meta struct size */
+ if (meta->magic != RING_BUFFER_META_MAGIC ||
+ meta->struct_size != sizeof(*meta)) {
+ pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
+ return false;
+ }
+
+ /* The subbuffer's size and number of subbuffers must match */
+ if (meta->subbuf_size != subbuf_size ||
+ meta->nr_subbufs != nr_pages + 1) {
+ pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
+ return false;
+ }
+
+ buffers_start = meta->first_buffer;
+ buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+ /* Is the head and commit buffers within the range of buffers? */
+ if (meta->head_buffer < buffers_start ||
+ meta->head_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu);
+ return false;
+ }
+
+ if (meta->commit_buffer < buffers_start ||
+ meta->commit_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu);
+ return false;
+ }
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
+
+ /* Is the meta buffers and the subbufs themselves have correct data? */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ if (meta->buffers[i] < 0 ||
+ meta->buffers[i] >= meta->nr_subbufs) {
+ pr_info("Ring buffer boot meta [%d] array out of range\n", cpu);
+ return false;
+ }
+
+ if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+ pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+ return false;
+ }
+
+ if (test_bit(meta->buffers[i], subbuf_mask)) {
+ pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
+ return false;
+ }
+
+ set_bit(meta->buffers[i], subbuf_mask);
+ subbuf = (void *)subbuf + subbuf_size;
+ }
+
+ return true;
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+ unsigned long long *timestamp, u64 *delta_ptr)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int events = 0;
+ int e;
+
+ *delta_ptr = 0;
+ *timestamp = 0;
+
+ ts = dpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(dpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ *delta_ptr = delta;
+ *timestamp = ts;
+ return -1;
+ }
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ events++;
+ ts += event->time_delta;
+ break;
+
+ default:
+ return -1;
+ }
+ }
+ *timestamp = ts;
+ return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+ unsigned long long ts;
+ u64 delta;
+ int tail;
+
+ tail = local_read(&dpage->commit);
+ return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page;
+ unsigned long entry_bytes = 0;
+ unsigned long entries = 0;
+ int ret;
+ int i;
+
+ if (!meta || !meta->head_buffer)
return;
- for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
- return;
+ /* Do the reader page first */
+ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer reader page is invalid\n");
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+ local_set(&cpu_buffer->reader_page->entries, ret);
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
- return;
+ head_page = cpu_buffer->head_page;
+
+ /* If both the head and commit are on the reader_page then we are done. */
+ if (head_page == cpu_buffer->reader_page &&
+ head_page == cpu_buffer->commit_page)
+ goto done;
+
+ /* Iterate until finding the commit page */
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+ /* Reader page has already been done */
+ if (head_page == cpu_buffer->reader_page)
+ continue;
+
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer meta [%d] invalid buffer page\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
+ entries += ret;
+ entry_bytes += local_read(&head_page->page->commit);
+ local_set(&cpu_buffer->head_page->entries, ret);
+
+ if (head_page == cpu_buffer->commit_page)
+ break;
+ }
+
+ if (head_page != cpu_buffer->commit_page) {
+ pr_info("Ring buffer meta [%d] commit page not found\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ done:
+ local_set(&cpu_buffer->entries, entries);
+ local_set(&cpu_buffer->entries_bytes, entry_bytes);
+
+ pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ return;
+
+ invalid:
+ /* The content of the buffers are invalid, reset the meta data */
+ meta->head_buffer = 0;
+ meta->commit_buffer = 0;
+
+ /* Reset the reader page */
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+
+ /* Reset all the subbuffers */
+ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
+ local_set(&head_page->entries, 0);
+ local_set(&head_page->page->commit, 0);
+ }
+}
+
+/* Used to calculate data delta */
+static char rb_data_ptr[] = "";
+
+#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
+#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
+
+static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
+{
+ meta->text_addr = THIS_TEXT_PTR;
+ meta->data_addr = THIS_DATA_PTR;
+}
+
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+{
+ struct ring_buffer_meta *meta;
+ unsigned long *subbuf_mask;
+ unsigned long delta;
+ void *subbuf;
+ int cpu;
+ int i;
+
+ /* Create a mask to test the subbuf array */
+ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
+ /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ void *next_meta;
+
+ meta = rb_range_meta(buffer, nr_pages, cpu);
+
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
+ /* Make the mappings match the current address */
+ subbuf = rb_subbufs_from_meta(meta);
+ delta = (unsigned long)subbuf - meta->first_buffer;
+ meta->first_buffer += delta;
+ meta->head_buffer += delta;
+ meta->commit_buffer += delta;
+ buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
+ buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
+ continue;
+ }
+
+ if (cpu < nr_cpu_ids - 1)
+ next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+ else
+ next_meta = (void *)buffer->range_addr_end;
+
+ memset(meta, 0, next_meta - (void *)meta);
+
+ meta->magic = RING_BUFFER_META_MAGIC;
+ meta->struct_size = sizeof(*meta);
+
+ meta->nr_subbufs = nr_pages + 1;
+ meta->subbuf_size = PAGE_SIZE;
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ meta->first_buffer = (unsigned long)subbuf;
+ rb_meta_init_text_addr(meta);
+
+ /*
+ * The buffers[] array holds the order of the sub-buffers
+ * that are after the meta data. The sub-buffers may
+ * be swapped out when read and inserted into a different
+ * location of the ring buffer. Although their addresses
+ * remain the same, the buffers[] array contains the
+ * index into the sub-buffers holding their actual order.
+ */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ meta->buffers[i] = i;
+ rb_init_page(subbuf);
+ subbuf += meta->subbuf_size;
+ }
+ }
+ bitmap_free(subbuf_mask);
+}
+
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val;
+
+ if (!meta)
+ return NULL;
+
+ if (*pos > meta->nr_subbufs)
+ return NULL;
+
+ val = *pos;
+ val++;
+
+ return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return rbm_start(m, pos);
+}
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val = (unsigned long)v;
+
+ if (val == 1) {
+ seq_printf(m, "head_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+ seq_printf(m, "commit_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+ seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size);
+ seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs);
+ return 0;
+ }
+
+ val -= 2;
+ seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+
+ return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+ .start = rbm_start,
+ .next = rbm_next,
+ .show = rbm_show,
+ .stop = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &rb_meta_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = buffer->buffers[cpu];
+
+ return 0;
+}
+
+/* Map the buffer_pages to the previous head and commit pages */
+static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+
+ if (meta->head_buffer == (unsigned long)bpage->page)
+ cpu_buffer->head_page = bpage;
+
+ if (meta->commit_buffer == (unsigned long)bpage->page) {
+ cpu_buffer->commit_page = bpage;
+ cpu_buffer->tail_page = bpage;
}
}
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
+ struct trace_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
@@ -1473,6 +2095,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
if (user_thread)
set_current_oom_origin();
+
+ if (buffer->range_addr_start)
+ meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1483,15 +2109,32 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_check_bpage(cpu_buffer, bpage);
- list_add(&bpage->list, pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
+ /*
+ * Append the pages as for mapped buffers we want to keep
+ * the order
+ */
+ list_add_tail(&bpage->list, pages);
+
+ if (meta) {
+ /* A range was given. Use that for the buffer page */
+ bpage->page = rb_range_buffer(cpu_buffer, i + 1);
+ if (!bpage->page)
+ goto free_pages;
+ /* If this is valid from a previous boot */
+ if (meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ bpage->id = i + 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto free_pages;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
bpage->order = cpu_buffer->buffer->subbuf_order;
- rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
goto free_pages;
@@ -1541,6 +2184,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
struct buffer_page *bpage;
struct page *page;
int ret;
@@ -1560,6 +2204,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+ mutex_init(&cpu_buffer->mapping_lock);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1570,11 +2215,28 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ if (buffer->range_addr_start) {
+ /*
+ * Range mapped buffers have the same restrictions as memory
+ * mapped ones do.
+ */
+ cpu_buffer->mapped = 1;
+ cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu);
+ bpage->page = rb_range_buffer(cpu_buffer, 0);
+ if (!bpage->page)
+ goto fail_free_reader;
+ if (cpu_buffer->ring_meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto fail_free_reader;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
@@ -1583,11 +2245,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (ret < 0)
goto fail_free_reader;
- cpu_buffer->head_page
- = list_entry(cpu_buffer->pages, struct buffer_page, list);
- cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_meta_validate_events(cpu_buffer);
+
+ /* If the boot meta was valid then this has already been updated */
+ meta = cpu_buffer->ring_meta;
+ if (!meta || !meta->head_buffer ||
+ !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) {
+ if (meta && meta->head_buffer &&
+ (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) {
+ pr_warn("Ring buffer meta buffers not all mapped\n");
+ if (!cpu_buffer->head_page)
+ pr_warn(" Missing head_page\n");
+ if (!cpu_buffer->commit_page)
+ pr_warn(" Missing commit_page\n");
+ if (!cpu_buffer->tail_page)
+ pr_warn(" Missing tail_page\n");
+ }
- rb_head_page_activate(cpu_buffer);
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ rb_head_page_activate(cpu_buffer);
+
+ if (cpu_buffer->ring_meta)
+ meta->commit_buffer = meta->head_buffer;
+ } else {
+ /* The valid meta buffer still needs to activate the head page */
+ rb_head_page_activate(cpu_buffer);
+ }
return cpu_buffer;
@@ -1624,22 +2310,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-/**
- * __ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes per cpu that is needed.
- * @flags: attributes to set for the ring buffer.
- * @key: ring buffer reader_lock_key.
- *
- * Currently the only flag that is available is the RB_FL_OVERWRITE
- * flag. This flag means that the buffer will overwrite old data
- * when the buffer wraps. If this flag is not set, the buffer will
- * drop data when the tail hits the head.
- */
-struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
- struct lock_class_key *key)
+static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long end,
+ struct lock_class_key *key)
{
struct trace_buffer *buffer;
long nr_pages;
+ int subbuf_size;
int bsize;
int cpu;
int ret;
@@ -1653,14 +2331,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- /* Default buffer page size - one system page */
- buffer->subbuf_order = 0;
- buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+ buffer->subbuf_order = order;
+ subbuf_size = (PAGE_SIZE << order);
+ buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE;
/* Max payload is buffer page size - header (8bytes) */
buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
- nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1668,10 +2345,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
- /* need at least two pages */
- if (nr_pages < 2)
- nr_pages = 2;
-
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -1680,6 +2353,56 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ /* If start/end are specified, then that overrides size */
+ if (start && end) {
+ unsigned long ptr;
+ int n;
+
+ size = end - start;
+ size = size / nr_cpu_ids;
+
+ /*
+ * The number of sub-buffers (nr_pages) is determined by the
+ * total size allocated minus the meta data size.
+ * Then that is divided by the number of per CPU buffers
+ * needed, plus account for the integer array index that
+ * will be appended to the meta data.
+ */
+ nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ (subbuf_size + sizeof(int));
+ /* Need at least two pages plus the reader page */
+ if (nr_pages < 3)
+ goto fail_free_buffers;
+
+ again:
+ /* Make sure that the size fits aligned */
+ for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_meta) +
+ sizeof(int) * nr_pages;
+ ptr = ALIGN(ptr, subbuf_size);
+ ptr += subbuf_size * nr_pages;
+ }
+ if (ptr > end) {
+ if (nr_pages <= 3)
+ goto fail_free_buffers;
+ nr_pages--;
+ goto again;
+ }
+
+ /* nr_pages should not count the reader page */
+ nr_pages--;
+ buffer->range_addr_start = start;
+ buffer->range_addr_end = end;
+
+ rb_range_meta_init(buffer, nr_pages);
+ } else {
+
+ /* need at least two pages */
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
+ if (nr_pages < 2)
+ nr_pages = 2;
+ }
+
cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
@@ -1708,9 +2431,73 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
kfree(buffer);
return NULL;
}
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
+{
+ /* Default buffer page size - one system page */
+ return alloc_buffer(size, flags, 0, 0, 0,key);
+
+}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
+ * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @order: sub-buffer order
+ * @start: start of allocated range
+ * @range_size: size of allocated range
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long range_size,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(size, flags, order, start, start + range_size, key);
+}
+
+/**
+ * ring_buffer_last_boot_delta - return the delta offset from last boot
+ * @buffer: The buffer to return the delta from
+ * @text: Return text delta
+ * @data: Return data delta
+ *
+ * Returns: The true if the delta is non zero
+ */
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+ long *data)
+{
+ if (!buffer)
+ return false;
+
+ if (!buffer->last_text_delta)
+ return false;
+
+ *text = buffer->last_text_delta;
+ *data = buffer->last_data_delta;
+
+ return true;
+}
+
+/**
* ring_buffer_free - free a ring buffer.
* @buffer: the buffer to free.
*/
@@ -1749,8 +2536,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
return buffer->time_stamp_abs;
}
-static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
return local_read(&bpage->entries) & RB_WRITE_MASK;
@@ -1819,6 +2604,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
/* make sure pages points to a valid page in the ring buffer */
cpu_buffer->pages = next_page;
+ cpu_buffer->cnt++;
/* update head page */
if (head_bit)
@@ -1925,6 +2711,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* pointer to point to end of list
*/
head_page->prev = last_page;
+ cpu_buffer->cnt++;
success = true;
break;
}
@@ -2278,7 +3065,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
/* Size is determined by what has been committed */
static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
{
- return rb_page_commit(bpage);
+ return rb_page_commit(bpage) & ~RB_MISSED_MASK;
}
static __always_inline unsigned
@@ -2317,6 +3104,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->next_event = 0;
}
+/* Return the index into the sub-buffers for a given sub-buffer */
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+{
+ void *subbuf_array;
+
+ subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs;
+ subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size);
+ return (subbuf - subbuf_array) / meta->subbuf_size;
+}
+
+static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *next_page)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ unsigned long old_head = (unsigned long)next_page->page;
+ unsigned long new_head;
+
+ rb_inc_page(&next_page);
+ new_head = (unsigned long)next_page->page;
+
+ /*
+ * Only move it forward once, if something else came in and
+ * moved it forward, then we don't want to touch it.
+ */
+ (void)cmpxchg(&meta->head_buffer, old_head, new_head);
+}
+
+static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *reader)
+{
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ void *old_reader = cpu_buffer->reader_page->page;
+ void *new_reader = reader->page;
+ int id;
+
+ id = reader->id;
+ cpu_buffer->reader_page->id = id;
+ reader->id = 0;
+
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader);
+ meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader);
+
+ /* The head pointer is the one after the reader */
+ rb_update_meta_head(cpu_buffer, reader);
+}
+
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2366,6 +3199,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_head(cpu_buffer, next_page);
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -2927,6 +3762,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
+ }
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3241,7 +4080,7 @@ static const char *show_irq_str(int bits)
return type[bits];
}
-/* Assume this is an trace event */
+/* Assume this is a trace event */
static const char *show_flags(struct ring_buffer_event *event)
{
struct trace_entry *entry;
@@ -3373,11 +4212,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
- struct ring_buffer_event *event;
struct buffer_data_page *bpage;
u64 ts, delta;
bool full = false;
- int e;
+ int ret;
bpage = info->tail_page->page;
@@ -3403,39 +4241,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
- ts = bpage->time_stamp;
-
- for (e = 0; e < tail; e += rb_event_length(event)) {
-
- event = (struct ring_buffer_event *)(bpage->data + e);
-
- switch (event->type_len) {
-
- case RINGBUF_TYPE_TIME_EXTEND:
- delta = rb_event_time_stamp(event);
- ts += delta;
- break;
-
- case RINGBUF_TYPE_TIME_STAMP:
- delta = rb_event_time_stamp(event);
- delta = rb_fix_abs_ts(delta, ts);
- if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
- }
- ts = delta;
- break;
-
- case RINGBUF_TYPE_PADDING:
- if (event->time_delta == 1)
- break;
- fallthrough;
- case RINGBUF_TYPE_DATA:
- ts += event->time_delta;
- break;
-
- default:
- RB_WARN_ON(cpu_buffer, 1);
+ ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+ if (ret < 0) {
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ goto out;
}
}
if ((full && ts > info->ts) ||
@@ -3610,8 +4421,13 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
- /* ring buffer does cmpxchg, make sure it is safe in NMI context */
- if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ /*
+ * ring buffer does cmpxchg as well as atomic64 operations
+ * (which some archs use locking for atomic64), make sure this
+ * is safe in NMI context
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) &&
(unlikely(in_nmi()))) {
return NULL;
}
@@ -3894,40 +4710,22 @@ int ring_buffer_write(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = rb_set_head_page(cpu_buffer);
- struct buffer_page *commit = cpu_buffer->commit_page;
-
- /* In case of error, head will be NULL */
- if (unlikely(!head))
- return true;
-
- /* Reader should exhaust content in reader page */
- if (reader->read != rb_page_commit(reader))
- return false;
-
- /*
- * If writers are committing on the reader page, knowing all
- * committed content has been read, the ring buffer is empty.
- */
- if (commit == reader)
- return true;
-
- /*
- * If writers are committing on a page other than reader page
- * and head page, there should always be content to read.
- */
- if (commit != head)
- return false;
+ return local_read(&cpu_buffer->entries) -
+ (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
- /*
- * Writers are committing on the head page, we just need
- * to care about there're committed data, and the reader will
- * swap reader page with head page when it is to read data.
- */
- return rb_page_commit(commit) == 0;
+static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return !rb_num_of_entries(cpu_buffer);
}
/**
@@ -4073,19 +4871,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
-/*
- * The total entries in the ring buffer is the running counter
- * of entries entered into the ring buffer, minus the sum of
- * the entries read from the ring buffer and the number of
- * entries that were overwritten.
- */
-static inline unsigned long
-rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return local_read(&cpu_buffer->entries) -
- (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
-}
-
/**
* ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
* @buffer: The ring buffer
@@ -4350,7 +5135,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
- commit_page = cpu_buffer->commit_page;
+ commit_page = READ_ONCE(cpu_buffer->commit_page);
commit_ts = commit_page->page->time_stamp;
/*
@@ -4376,7 +5161,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
return ((iter->head_page == commit_page && iter->head >= commit) ||
(iter->head_page == reader && commit_page == head_page &&
head_page->read == commit &&
- iter->head == rb_page_commit(cpu_buffer->reader_page)));
+ iter->head == rb_page_size(cpu_buffer->reader_page)));
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
@@ -4544,6 +5329,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (!ret)
goto spin;
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_reader(cpu_buffer, reader);
+
/*
* Yay! We succeeded in replacing the page.
*
@@ -4552,6 +5340,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
rb_inc_page(&cpu_buffer->head_page);
+ cpu_buffer->cnt++;
local_inc(&cpu_buffer->pages_read);
/* Finally update the reader page to the new head */
@@ -4996,13 +5785,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* @flags: gfp flags to use for memory allocation
*
* This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer recording
+ * through the buffer. Memory is allocated, buffer resizing
* is disabled, and the iterator pointer is returned to the caller.
*
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
* After a sequence of ring_buffer_read_prepare calls, the user is
* expected to make at least one call to ring_buffer_read_prepare_sync.
* Afterwards, ring_buffer_read_start is invoked to get things going
@@ -5089,24 +5874,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_start);
* ring_buffer_read_finish - finish reading the iterator of the buffer
* @iter: The iterator retrieved by ring_buffer_start
*
- * This re-enables the recording to the buffer, and frees the
- * iterator.
+ * This re-enables resizing of the buffer, and frees the iterator.
*/
void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
- unsigned long flags;
- /*
- * Ring buffer is disabled from recording, here's a good place
- * to check the integrity of the ring buffer.
- * Must prevent readers from trying to read, as the check
- * clears the HEAD page and readers require it.
- */
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ /* Use this opportunity to check the integrity of the ring buffer. */
rb_check_pages(cpu_buffer);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->resize_disabled);
kfree(iter->event);
@@ -5171,6 +5947,25 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+
+ if (!meta)
+ return;
+
+ meta->reader.read = cpu_buffer->reader_page->read;
+ meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.lost_events = cpu_buffer->lost_events;
+
+ meta->entries = local_read(&cpu_buffer->entries);
+ meta->overrun = local_read(&cpu_buffer->overrun);
+ meta->read = cpu_buffer->read;
+
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5217,6 +6012,14 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
+
+ if (cpu_buffer->mapped) {
+ rb_update_meta_page(cpu_buffer);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = meta->head_buffer;
+ }
+ }
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
@@ -5247,6 +6050,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -5265,6 +6069,11 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -5279,6 +6088,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -5306,6 +6116,11 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
+ /* Make sure persistent meta now uses this buffer's addresses */
+ meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
+ if (meta)
+ rb_meta_init_text_addr(meta);
+
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -5429,6 +6244,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ /* It's up to the callers to not try to swap mapped buffers */
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
@@ -5538,7 +6359,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page) {
kfree(bpage);
@@ -5679,7 +6501,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
event = rb_reader_event(cpu_buffer);
read = reader->read;
- commit = rb_page_commit(reader);
+ commit = rb_page_size(reader);
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
@@ -5692,7 +6514,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
- cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ cpu_buffer->reader_page == cpu_buffer->commit_page ||
+ cpu_buffer->mapped) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -5755,7 +6578,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
- cpu_buffer->read_bytes += rb_page_commit(reader);
+ cpu_buffer->read_bytes += rb_page_size(reader);
/* swap the pages */
rb_init_page(bpage);
@@ -5915,6 +6738,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer = buffer->buffers[cpu];
+ if (cpu_buffer->mapped) {
+ err = -EBUSY;
+ goto error;
+ }
+
/* Update the number of pages to match the new size */
nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
@@ -5939,39 +6767,39 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_data_page *old_free_data_page;
+ struct list_head old_pages;
+ unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
cpu_buffer = buffer->buffers[cpu];
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
/* Clear the head bit to make the link list normal to read */
rb_head_page_deactivate(cpu_buffer);
- /* Now walk the list and free all the old sub buffers */
- list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- /* The above loop stopped an the last page needing to be freed */
- bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
- free_buffer_page(bpage);
-
- /* Free the current reader page */
- free_buffer_page(cpu_buffer->reader_page);
+ /*
+ * Collect buffers from the cpu_buffer pages list and the
+ * reader_page on old_pages, so they can be freed later when not
+ * under a spinlock. The pages list is a linked list with no
+ * head, adding old_pages turns it into a regular list with
+ * old_pages being the head.
+ */
+ list_add(&old_pages, cpu_buffer->pages);
+ list_add(&cpu_buffer->reader_page->list, &old_pages);
/* One page was allocated for the reader page */
cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
struct buffer_page, list);
list_del_init(&cpu_buffer->reader_page->list);
- /* The cpu_buffer pages are a link list with no head */
+ /* Install the new pages, remove the head from the list */
cpu_buffer->pages = cpu_buffer->new_pages.next;
- cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
- cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
-
- /* Clear the new_pages list */
- INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ list_del_init(&cpu_buffer->new_pages);
+ cpu_buffer->cnt++;
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
@@ -5980,11 +6808,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
cpu_buffer->nr_pages_to_update = 0;
- free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ old_free_data_page = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
rb_head_page_activate(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ /* Free old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, &old_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ free_pages((unsigned long)old_free_data_page, old_order);
+
rb_check_pages(cpu_buffer);
}
@@ -6016,6 +6853,441 @@ error:
}
EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct page *page;
+
+ if (cpu_buffer->meta_page)
+ return 0;
+
+ page = alloc_page(GFP_USER | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ cpu_buffer->meta_page = page_to_virt(page);
+
+ return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+
+ free_page(addr);
+ cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *subbuf_ids)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
+ struct buffer_page *first_subbuf, *subbuf;
+ int id = 0;
+
+ subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+ cpu_buffer->reader_page->id = id++;
+
+ first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
+ do {
+ if (WARN_ON(id >= nr_subbufs))
+ break;
+
+ subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf->id = id;
+
+ rb_inc_page(&subbuf);
+ id++;
+ } while (subbuf != first_subbuf);
+
+ /* install subbuf ID to kern VA translation */
+ cpu_buffer->subbuf_ids = subbuf_ids;
+
+ meta->meta_struct_len = sizeof(*meta);
+ meta->nr_subbufs = nr_subbufs;
+ meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ meta->meta_page_size = meta->subbuf_size;
+
+ rb_update_meta_page(cpu_buffer);
+}
+
+static struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-EINVAL);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->user_mapped) {
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ return cpu_buffer;
+}
+
+static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+/*
+ * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need
+ * to be set-up or torn-down.
+ */
+static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
+ bool inc)
+{
+ unsigned long flags;
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ /* mapped is always greater or equal to user_mapped */
+ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ return -EINVAL;
+
+ if (inc && cpu_buffer->mapped == UINT_MAX)
+ return -EBUSY;
+
+ if (WARN_ON(!inc && cpu_buffer->user_mapped == 0))
+ return -EINVAL;
+
+ mutex_lock(&cpu_buffer->buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (inc) {
+ cpu_buffer->user_mapped++;
+ cpu_buffer->mapped++;
+ } else {
+ cpu_buffer->user_mapped--;
+ cpu_buffer->mapped--;
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ mutex_unlock(&cpu_buffer->buffer->mutex);
+
+ return 0;
+}
+
+/*
+ * +--------------+ pgoff == 0
+ * | meta page |
+ * +--------------+ pgoff == 1
+ * | subbuffer 0 |
+ * | |
+ * +--------------+ pgoff == (1 + (1 << subbuf_order))
+ * | subbuffer 1 |
+ * | |
+ * ...
+ */
+#ifdef CONFIG_MMU
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
+ unsigned int subbuf_pages, subbuf_order;
+ struct page **pages;
+ int p = 0, s = 0;
+ int err;
+
+ /* Refuse MP_PRIVATE or writable mappings */
+ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
+ !(vma->vm_flags & VM_MAYSHARE))
+ return -EPERM;
+
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ if (subbuf_order && pgoff % subbuf_pages)
+ return -EINVAL;
+
+ /*
+ * Make sure the mapping cannot become writable later. Also tell the VM
+ * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
+ */
+ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP,
+ VM_MAYWRITE);
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
+ nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */
+ if (nr_pages <= pgoff)
+ return -EINVAL;
+
+ nr_pages -= pgoff;
+
+ nr_vma_pages = vma_pages(vma);
+ if (!nr_vma_pages || nr_vma_pages > nr_pages)
+ return -EINVAL;
+
+ nr_pages = nr_vma_pages;
+
+ pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (!pgoff) {
+ unsigned long meta_page_padding;
+
+ pages[p++] = virt_to_page(cpu_buffer->meta_page);
+
+ /*
+ * Pad with the zero-page to align the meta-page with the
+ * sub-buffers.
+ */
+ meta_page_padding = subbuf_pages - 1;
+ while (meta_page_padding-- && p < nr_pages) {
+ unsigned long __maybe_unused zero_addr =
+ vma->vm_start + (PAGE_SIZE * p);
+
+ pages[p++] = ZERO_PAGE(zero_addr);
+ }
+ } else {
+ /* Skip the meta-page */
+ pgoff -= subbuf_pages;
+
+ s += pgoff / subbuf_pages;
+ }
+
+ while (p < nr_pages) {
+ struct page *page;
+ int off = 0;
+
+ if (WARN_ON_ONCE(s >= nr_subbufs)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+
+ for (; off < (1 << (subbuf_order)); off++, page++) {
+ if (p >= nr_pages)
+ break;
+
+ pages[p++] = page;
+ }
+ s++;
+ }
+
+ err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+
+out:
+ kfree(pages);
+
+ return err;
+}
+#else
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+ struct vm_area_struct *vma)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags, *subbuf_ids;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->user_mapped) {
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err)
+ err = __rb_inc_dec_mapped(cpu_buffer, true);
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return err;
+ }
+
+ /* prevent another thread from changing buffer/sub-buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ err = rb_alloc_meta_page(cpu_buffer);
+ if (err)
+ goto unlock;
+
+ /* subbuf_ids include the reader while nr_pages does not */
+ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
+ if (!subbuf_ids) {
+ rb_free_meta_page(cpu_buffer);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ /*
+ * Lock all readers to block any subbuf swap until the subbuf IDs are
+ * assigned.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ /* This is the first time it is mapped by user */
+ cpu_buffer->mapped++;
+ cpu_buffer->user_mapped = 1;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ } else {
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+unlock:
+ mutex_unlock(&buffer->mutex);
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->user_mapped) {
+ err = -ENODEV;
+ goto out;
+ } else if (cpu_buffer->user_mapped > 1) {
+ __rb_inc_dec_mapped(cpu_buffer, false);
+ goto out;
+ }
+
+ mutex_lock(&buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ /* This is the last user space mapping */
+ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ cpu_buffer->mapped--;
+ cpu_buffer->user_mapped = 0;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+
+ mutex_unlock(&buffer->mutex);
+
+out:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ unsigned long missed_events;
+ unsigned long reader_size;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return (int)PTR_ERR(cpu_buffer);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+consume:
+ if (rb_per_cpu_empty(cpu_buffer))
+ goto out;
+
+ reader_size = rb_page_size(cpu_buffer->reader_page);
+
+ /*
+ * There are data to be read on the current reader page, we can
+ * return to the caller. But before that, we assume the latter will read
+ * everything. Let's update the kernel reader accordingly.
+ */
+ if (cpu_buffer->reader_page->read < reader_size) {
+ while (cpu_buffer->reader_page->read < reader_size)
+ rb_advance_reader(cpu_buffer);
+ goto out;
+ }
+
+ reader = rb_get_reader_page(cpu_buffer);
+ if (WARN_ON(!reader))
+ goto out;
+
+ /* Check if any events were dropped */
+ missed_events = cpu_buffer->lost_events;
+
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
+ if (missed_events) {
+ struct buffer_data_page *bpage = reader->page;
+ unsigned int commit;
+ /*
+ * Use the real_end for the data size,
+ * This gives us a chance to store the lost events
+ * on the page.
+ */
+ if (reader->real_end)
+ local_set(&bpage->commit, reader->real_end);
+ /*
+ * If there is room at the end of the page to save the
+ * missed events, then record it there.
+ */
+ commit = rb_page_size(reader);
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
+ memcpy(&bpage->data[commit], &missed_events,
+ sizeof(missed_events));
+ local_add(RB_MISSED_STORED, &bpage->commit);
+ }
+ local_add(RB_MISSED_EVENTS, &bpage->commit);
+ }
+ } else {
+ /*
+ * There really shouldn't be any missed events if the commit
+ * is on the reader page.
+ */
+ WARN_ON_ONCE(missed_events);
+ }
+
+ cpu_buffer->lost_events = 0;
+
+ goto consume;
+
+out:
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+
+ rb_update_meta_page(cpu_buffer);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 008187ebd7fe..cdc3aea12c93 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -307,14 +307,14 @@ static void ring_buffer_producer(void)
if (!disable_reader) {
if (consumer_fifo)
trace_printk("Running Consumer at SCHED_FIFO %s\n",
- consumer_fifo == 1 ? "low" : "high");
+ str_low_high(consumer_fifo == 1));
else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
}
if (producer_fifo)
trace_printk("Running Producer at SCHED_FIFO %s\n",
- producer_fifo == 1 ? "low" : "high");
+ str_low_high(producer_fifo == 1));
else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 831779607e84..8226352a0062 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -25,30 +25,9 @@ menuconfig RV
For further information, see:
Documentation/trace/rv/runtime-verification.rst
-config RV_MON_WIP
- depends on RV
- depends on PREEMPT_TRACER
- select DA_MON_EVENTS_IMPLICIT
- bool "wip monitor"
- help
- Enable wip (wakeup in preemptive) sample monitor that illustrates
- the usage of per-cpu monitors, and one limitation of the
- preempt_disable/enable events.
-
- For further information, see:
- Documentation/trace/rv/monitor_wip.rst
-
-config RV_MON_WWNR
- depends on RV
- select DA_MON_EVENTS_ID
- bool "wwnr monitor"
- help
- Enable wwnr (wakeup while not running) sample monitor, this is a
- sample monitor that illustrates the usage of per-task monitor.
- The model is borken on purpose: it serves to test reactors.
-
- For further information, see:
- Documentation/trace/rv/monitor_wwnr.rst
+source "kernel/trace/rv/monitors/wip/Kconfig"
+source "kernel/trace/rv/monitors/wwnr/Kconfig"
+# Add new monitors here
config RV_REACTORS
bool "Runtime verification reactors"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 963d14875b45..188b64668e1f 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -1,8 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
+ccflags-y += -I $(src) # needed for trace events
+
obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
new file mode 100644
index 000000000000..3ef664b5cd90
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -0,0 +1,12 @@
+config RV_MON_WIP
+ depends on RV
+ depends on PREEMPT_TRACER
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable wip (wakeup in preemptive) sample monitor that illustrates
+ the usage of per-cpu monitors, and one limitation of the
+ preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index b2b49a27e886..db7389157c87 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -10,7 +10,7 @@
#define MODULE_NAME "wip"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include <trace/events/preemptirq.h>
diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h
new file mode 100644
index 000000000000..aa2162f47a4c
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WIP
+DEFINE_EVENT(event_da_monitor, event_wip,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_wip,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_WIP */
diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig
new file mode 100644
index 000000000000..ee741aa6d6b8
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/Kconfig
@@ -0,0 +1,11 @@
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable wwnr (wakeup while not running) sample monitor, this is a
+ sample monitor that illustrates the usage of per-task monitor.
+ The model is borken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 0e43dd2db685..3b16994a9984 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -10,7 +10,7 @@
#define MODULE_NAME "wwnr"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include "wwnr.h"
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
new file mode 100644
index 000000000000..fc97ec7476ad
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WWNR
+/* id is the pid of the task */
+DEFINE_EVENT(event_da_monitor_id, event_wwnr,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_wwnr,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_WWNR */
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 2f68e93fff0b..8657fc8806e7 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -41,7 +41,7 @@
* per-task monitor, and so on), and the helper functions that glue the
* monitor to the system via trace. Generally, a monitor includes some form
* of trace output as a reaction for event parsing and exceptions,
- * as depicted bellow:
+ * as depicted below:
*
* Linux +----- RV Monitor ----------------------------------+ Formal
* Realm | | Realm
@@ -145,7 +145,7 @@
#ifdef CONFIG_DA_MON_EVENTS
#define CREATE_TRACE_POINTS
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#endif
#include "rv.h"
@@ -245,6 +245,7 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
/**
* rv_disable_monitor - disable a given runtime monitor
+ * @mdef: Pointer to the monitor definition structure.
*
* Returns 0 on success.
*/
@@ -256,6 +257,7 @@ int rv_disable_monitor(struct rv_monitor_def *mdef)
/**
* rv_enable_monitor - enable a given runtime monitor
+ * @mdef: Pointer to the monitor definition structure.
*
* Returns 0 on success, error otherwise.
*/
@@ -304,7 +306,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
static const struct file_operations interface_enable_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitor_enable_write_data,
.read = monitor_enable_read_data,
};
@@ -327,7 +328,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf,
static const struct file_operations interface_desc_fops = {
.open = simple_open,
- .llseek = no_llseek,
.read = monitor_desc_read_data,
};
@@ -672,7 +672,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
static const struct file_operations monitoring_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitoring_on_write_data,
.read = monitoring_on_read_data,
};
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 6aae106695b6..7b49cbe388d4 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -426,7 +426,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
static const struct file_operations reacting_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = reacting_on_write_data,
.read = reacting_on_read_data,
};
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
new file mode 100644
index 000000000000..5e65097423ba
--- /dev/null
+++ b/kernel/trace/rv/rv_trace.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rv
+
+#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RV_H
+
+#include <linux/rv.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT
+DECLARE_EVENT_CLASS(event_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ __array( char, next_state, MAX_DA_NAME_LEN )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%s x %s -> %s %s",
+ __entry->state,
+ __entry->event,
+ __entry->next_state,
+ __entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor,
+
+ TP_PROTO(char *state, char *event),
+
+ TP_ARGS(state, event),
+
+ TP_STRUCT__entry(
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ ),
+
+ TP_printk("event %s not expected in the state %s",
+ __entry->event,
+ __entry->state)
+);
+
+#include <monitors/wip/wip_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
+
+#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
+
+#ifdef CONFIG_DA_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(id, state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ __array( char, next_state, MAX_DA_NAME_LEN )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
+ __entry->id = id;
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%d: %s x %s -> %s %s",
+ __entry->id,
+ __entry->state,
+ __entry->event,
+ __entry->next_state,
+ __entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event),
+
+ TP_ARGS(id, state, event),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s",
+ __entry->id,
+ __entry->event,
+ __entry->state)
+);
+
+#include <monitors/wwnr/wwnr_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
+
+#endif /* CONFIG_DA_MON_EVENTS_ID */
+#endif /* _TRACE_RV_H */
+
+/* This part ust be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE rv_trace
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c9c898307348..0e6d517e74e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -13,7 +13,7 @@
* Copyright (C) 2004 Nadia Yvette Chambers
*/
#include <linux/ring_buffer.h>
-#include <generated/utsrelease.h>
+#include <linux/utsname.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
@@ -26,6 +26,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
@@ -39,7 +40,6 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
-#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -105,7 +105,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
* tracing is active, only save the comm when a trace event
* occurred.
*/
-static DEFINE_PER_CPU(bool, trace_taskinfo_save);
+DEFINE_PER_CPU(bool, trace_taskinfo_save);
/*
* Kill all tracing for good (never come back).
@@ -131,9 +131,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
* /proc/sys/kernel/ftrace_dump_on_oops
* Set 1 if you want to dump buffers of all CPUs
* Set 2 if you want to dump the buffer of the CPU that triggered oops
+ * Set instance name if you want to dump the specific trace instance
+ * Multiple instance dump is also supported, and instances are seperated
+ * by commas.
*/
-
-enum ftrace_dump_mode ftrace_dump_on_oops;
+/* Set to string format zero to disable by default */
+char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
@@ -179,7 +182,6 @@ static void ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer,
unsigned int trace_ctx);
-#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
@@ -202,19 +204,33 @@ static int __init set_cmdline_ftrace(char *str)
}
__setup("ftrace=", set_cmdline_ftrace);
+int ftrace_dump_on_oops_enabled(void)
+{
+ if (!strcmp("0", ftrace_dump_on_oops))
+ return 0;
+ else
+ return 1;
+}
+
static int __init set_ftrace_dump_on_oops(char *str)
{
- if (*str++ != '=' || !*str || !strcmp("1", str)) {
- ftrace_dump_on_oops = DUMP_ALL;
+ if (!*str) {
+ strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
return 1;
}
- if (!strcmp("orig_cpu", str) || !strcmp("2", str)) {
- ftrace_dump_on_oops = DUMP_ORIG;
- return 1;
- }
+ if (*str == ',') {
+ strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
+ strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1);
+ return 1;
+ }
+
+ if (*str++ == '=') {
+ strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE);
+ return 1;
+ }
- return 0;
+ return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
@@ -467,7 +483,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR)
+ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -475,7 +491,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -485,6 +501,29 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+static struct trace_array *printk_trace = &global_trace;
+
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+ * The binary format of traceprintk can cause a crash if used
+ * by a buffer from another boot. Force the use of the
+ * non binary version of trace_printk if the trace_printk
+ * buffer is a boot mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+static void update_printk_trace(struct trace_array *tr)
+{
+ if (printk_trace == tr)
+ return;
+
+ printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace = tr;
+ tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+}
+
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
if (!tr)
@@ -497,19 +536,16 @@ LIST_HEAD(ftrace_trace_arrays);
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret = -ENODEV;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
tr->ref++;
- ret = 0;
- break;
+ return 0;
}
}
- mutex_unlock(&trace_types_lock);
- return ret;
+ return -ENODEV;
}
static void __trace_array_put(struct trace_array *this_tr)
@@ -555,19 +591,6 @@ int tracing_check_open_get_tr(struct trace_array *tr)
return 0;
}
-int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event)
-{
- if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(call->filter, rec)) {
- __trace_event_discard_commit(buffer, event);
- return 1;
- }
-
- return 0;
-}
-
/**
* trace_find_filtered_pid - check if a pid exists in a filtered_pid list
* @filtered_pids: The list of pids to check
@@ -950,7 +973,8 @@ static inline void trace_access_lock_init(void)
#endif
#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct trace_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
@@ -959,7 +983,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
int skip, struct pt_regs *regs);
#else
-static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
+static inline void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
@@ -1102,7 +1127,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts);
*/
int __trace_puts(unsigned long ip, const char *str, int size)
{
- return __trace_array_puts(&global_trace, ip, str, size);
+ return __trace_array_puts(printk_trace, ip, str, size);
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -1113,6 +1138,7 @@ EXPORT_SYMBOL_GPL(__trace_puts);
*/
int __trace_bputs(unsigned long ip, const char *str)
{
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
@@ -1120,14 +1146,17 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int ret = 0;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str, strlen(str));
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -1140,7 +1169,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -1176,6 +1205,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
return;
}
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
local_irq_save(flags);
update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags);
@@ -1301,6 +1336,50 @@ static void free_snapshot(struct trace_array *tr)
tr->allocated_snapshot = false;
}
+static int tracing_arm_snapshot_locked(struct trace_array *tr)
+{
+ int ret;
+
+ lockdep_assert_held(&trace_types_lock);
+
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (tr->snapshot == UINT_MAX || tr->mapped) {
+ spin_unlock(&tr->snapshot_trigger_lock);
+ return -EBUSY;
+ }
+
+ tr->snapshot++;
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret) {
+ spin_lock(&tr->snapshot_trigger_lock);
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+ }
+
+ return ret;
+}
+
+int tracing_arm_snapshot(struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_arm_snapshot_locked(tr);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+void tracing_disarm_snapshot(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->snapshot))
+ tr->snapshot--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+
/**
* tracing_alloc_snapshot - allocate snapshot buffer.
*
@@ -1362,26 +1441,20 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
cond_update_fn_t update)
{
- struct cond_snapshot *cond_snapshot;
- int ret = 0;
+ struct cond_snapshot *cond_snapshot __free(kfree) =
+ kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
+ int ret;
- cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
if (!cond_snapshot)
return -ENOMEM;
cond_snapshot->cond_data = cond_data;
cond_snapshot->update = update;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret)
- goto fail_unlock;
-
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto fail_unlock;
- }
+ if (tr->current_trace->use_max_tr)
+ return -EBUSY;
/*
* The cond_snapshot can only change to NULL without the
@@ -1391,25 +1464,20 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
* do safely with only holding the trace_types_lock and not
* having to take the max_lock.
*/
- if (tr->cond_snapshot) {
- ret = -EBUSY;
- goto fail_unlock;
- }
+ if (tr->cond_snapshot)
+ return -EBUSY;
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- tr->cond_snapshot = cond_snapshot;
+ tr->cond_snapshot = no_free_ptr(cond_snapshot);
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
- mutex_unlock(&trace_types_lock);
-
- return ret;
-
- fail_unlock:
- mutex_unlock(&trace_types_lock);
- kfree(cond_snapshot);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
@@ -1440,6 +1508,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
+ tracing_disarm_snapshot(tr);
+
return ret;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
@@ -1482,6 +1552,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#define free_snapshot(tr) do { } while (0)
+#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1839,7 +1910,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
- strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ strscpy(max_data->comm, tsk->comm);
max_data->pid = tsk->pid;
/*
* If tsk == current, then use current_uid(), as that does not use
@@ -1955,15 +2026,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
#endif /* CONFIG_TRACER_MAX_TRACE */
+struct pipe_wait {
+ struct trace_iterator *iter;
+ int wait_index;
+};
+
+static bool wait_pipe_cond(void *data)
+{
+ struct pipe_wait *pwait = data;
+ struct trace_iterator *iter = pwait->iter;
+
+ if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index)
+ return true;
+
+ return iter->closed;
+}
+
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ struct pipe_wait pwait;
int ret;
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+ pwait.wait_index = atomic_read_acquire(&iter->wait_index);
+ pwait.iter = iter;
+
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full,
+ wait_pipe_cond, &pwait);
#ifdef CONFIG_TRACER_MAX_TRACE
/*
@@ -2098,10 +2190,10 @@ static __init int init_trace_selftests(void)
selftests_can_run = true;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (list_empty(&postponed_selftests))
- goto out;
+ return 0;
pr_info("Running postponed tracer tests:\n");
@@ -2130,17 +2222,10 @@ static __init int init_trace_selftests(void)
}
tracing_selftest_running = false;
- out:
- mutex_unlock(&trace_types_lock);
-
return 0;
}
core_initcall(init_trace_selftests);
#else
-static inline int run_tracer_selftest(struct tracer *type)
-{
- return 0;
-}
static inline int do_run_tracer_selftest(struct tracer *type)
{
return 0;
@@ -2274,6 +2359,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
ring_buffer_record_enable(buffer);
}
+static void tracing_reset_all_cpus(struct array_buffer *buf)
+{
+ struct trace_buffer *buffer = buf->buffer;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
+
+ ring_buffer_reset(buffer);
+
+ ring_buffer_record_enable(buffer);
+}
+
/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus_unlocked(void)
{
@@ -2299,98 +2403,6 @@ void tracing_reset_all_online_cpus(void)
mutex_unlock(&trace_types_lock);
}
-/*
- * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
- * is the tgid last observed corresponding to pid=i.
- */
-static int *tgid_map;
-
-/* The maximum valid index into tgid_map. */
-static size_t tgid_map_max;
-
-#define SAVED_CMDLINES_DEFAULT 128
-#define NO_CMDLINE_MAP UINT_MAX
-/*
- * Preemption must be disabled before acquiring trace_cmdline_lock.
- * The various trace_arrays' max_lock must be acquired in a context
- * where interrupt is disabled.
- */
-static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
-struct saved_cmdlines_buffer {
- unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
- unsigned *map_cmdline_to_pid;
- unsigned cmdline_num;
- int cmdline_idx;
- char saved_cmdlines[];
-};
-static struct saved_cmdlines_buffer *savedcmd;
-
-static inline char *get_saved_cmdlines(int idx)
-{
- return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
-}
-
-static inline void set_cmdline(int idx, const char *cmdline)
-{
- strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
-}
-
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
-
- kfree(s->map_cmdline_to_pid);
- kmemleak_free(s);
- free_pages((unsigned long)s, order);
-}
-
-static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
-{
- struct saved_cmdlines_buffer *s;
- struct page *page;
- int orig_size, size;
- int order;
-
- /* Figure out how much is needed to hold the given number of cmdlines */
- orig_size = sizeof(*s) + val * TASK_COMM_LEN;
- order = get_order(orig_size);
- size = 1 << (order + PAGE_SHIFT);
- page = alloc_pages(GFP_KERNEL, order);
- if (!page)
- return NULL;
-
- s = page_address(page);
- kmemleak_alloc(s, size, 1, GFP_KERNEL);
- memset(s, 0, sizeof(*s));
-
- /* Round up to actual allocation */
- val = (size - sizeof(*s)) / TASK_COMM_LEN;
- s->cmdline_num = val;
-
- s->map_cmdline_to_pid = kmalloc_array(val,
- sizeof(*s->map_cmdline_to_pid),
- GFP_KERNEL);
- if (!s->map_cmdline_to_pid) {
- free_saved_cmdlines_buffer(s);
- return NULL;
- }
-
- s->cmdline_idx = 0;
- memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
- sizeof(s->map_pid_to_cmdline));
- memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
- val * sizeof(*s->map_cmdline_to_pid));
-
- return s;
-}
-
-static int trace_create_savedcmd(void)
-{
- savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
-
- return savedcmd ? 0 : -ENOMEM;
-}
-
int is_tracing_stopped(void)
{
return global_trace.stop_count;
@@ -2483,201 +2495,6 @@ void tracing_stop(void)
return tracing_stop_tr(&global_trace);
}
-static int trace_save_cmdline(struct task_struct *tsk)
-{
- unsigned tpid, idx;
-
- /* treat recording of idle task as a success */
- if (!tsk->pid)
- return 1;
-
- tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
-
- /*
- * It's not the end of the world if we don't get
- * the lock, but we also don't want to spin
- * nor do we want to disable interrupts,
- * so if we miss here, then better luck next time.
- *
- * This is called within the scheduler and wake up, so interrupts
- * had better been disabled and run queue lock been held.
- */
- lockdep_assert_preemption_disabled();
- if (!arch_spin_trylock(&trace_cmdline_lock))
- return 0;
-
- idx = savedcmd->map_pid_to_cmdline[tpid];
- if (idx == NO_CMDLINE_MAP) {
- idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
-
- savedcmd->map_pid_to_cmdline[tpid] = idx;
- savedcmd->cmdline_idx = idx;
- }
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- set_cmdline(idx, tsk->comm);
-
- arch_spin_unlock(&trace_cmdline_lock);
-
- return 1;
-}
-
-static void __trace_find_cmdline(int pid, char comm[])
-{
- unsigned map;
- int tpid;
-
- if (!pid) {
- strcpy(comm, "<idle>");
- return;
- }
-
- if (WARN_ON_ONCE(pid < 0)) {
- strcpy(comm, "<XXX>");
- return;
- }
-
- tpid = pid & (PID_MAX_DEFAULT - 1);
- map = savedcmd->map_pid_to_cmdline[tpid];
- if (map != NO_CMDLINE_MAP) {
- tpid = savedcmd->map_cmdline_to_pid[map];
- if (tpid == pid) {
- strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
- return;
- }
- }
- strcpy(comm, "<...>");
-}
-
-void trace_find_cmdline(int pid, char comm[])
-{
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
-
- __trace_find_cmdline(pid, comm);
-
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-}
-
-static int *trace_find_tgid_ptr(int pid)
-{
- /*
- * Pairs with the smp_store_release in set_tracer_flag() to ensure that
- * if we observe a non-NULL tgid_map then we also observe the correct
- * tgid_map_max.
- */
- int *map = smp_load_acquire(&tgid_map);
-
- if (unlikely(!map || pid > tgid_map_max))
- return NULL;
-
- return &map[pid];
-}
-
-int trace_find_tgid(int pid)
-{
- int *ptr = trace_find_tgid_ptr(pid);
-
- return ptr ? *ptr : 0;
-}
-
-static int trace_save_tgid(struct task_struct *tsk)
-{
- int *ptr;
-
- /* treat recording of idle task as a success */
- if (!tsk->pid)
- return 1;
-
- ptr = trace_find_tgid_ptr(tsk->pid);
- if (!ptr)
- return 0;
-
- *ptr = tsk->tgid;
- return 1;
-}
-
-static bool tracing_record_taskinfo_skip(int flags)
-{
- if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
- return true;
- if (!__this_cpu_read(trace_taskinfo_save))
- return true;
- return false;
-}
-
-/**
- * tracing_record_taskinfo - record the task info of a task
- *
- * @task: task to record
- * @flags: TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
- */
-void tracing_record_taskinfo(struct task_struct *task, int flags)
-{
- bool done;
-
- if (tracing_record_taskinfo_skip(flags))
- return;
-
- /*
- * Record as much task information as possible. If some fail, continue
- * to try to record the others.
- */
- done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
-
- /* If recording any information failed, retry again soon. */
- if (!done)
- return;
-
- __this_cpu_write(trace_taskinfo_save, false);
-}
-
-/**
- * tracing_record_taskinfo_sched_switch - record task info for sched_switch
- *
- * @prev: previous task during sched_switch
- * @next: next task during sched_switch
- * @flags: TRACE_RECORD_CMDLINE for recording comm
- * TRACE_RECORD_TGID for recording tgid
- */
-void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
- struct task_struct *next, int flags)
-{
- bool done;
-
- if (tracing_record_taskinfo_skip(flags))
- return;
-
- /*
- * Record as much task information as possible. If some fail, continue
- * to try to record the others.
- */
- done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
- done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
- done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
-
- /* If recording any information failed, retry again soon. */
- if (!done)
- return;
-
- __this_cpu_write(trace_taskinfo_save, false);
-}
-
-/* Helpers to record a specific task information */
-void tracing_record_cmdline(struct task_struct *task)
-{
- tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
-}
-
-void tracing_record_tgid(struct task_struct *task)
-{
- tracing_record_taskinfo(task, TRACE_RECORD_TGID);
-}
-
/*
* Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
* overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
@@ -2719,6 +2536,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
trace_flags |= TRACE_FLAG_NEED_RESCHED;
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
}
@@ -2965,14 +2784,14 @@ static void output_printk(struct trace_event_buffer *fbuffer)
raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-int tracepoint_printk_sysctl(struct ctl_table *table, int write,
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
int ret;
- mutex_lock(&tracepoint_printk_mutex);
+ guard(mutex)(&tracepoint_printk_mutex);
save_tracepoint_printk = tracepoint_printk;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
@@ -2985,16 +2804,13 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
tracepoint_printk = 0;
if (save_tracepoint_printk == tracepoint_printk)
- goto out;
+ return ret;
if (tracepoint_printk)
static_key_enable(&tracepoint_printk_key.key);
else
static_key_disable(&tracepoint_printk_key.key);
- out:
- mutex_unlock(&tracepoint_printk_mutex);
-
return ret;
}
@@ -3064,7 +2880,6 @@ void
trace_function(struct trace_array *tr, unsigned long ip, unsigned long
parent_ip, unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -3077,11 +2892,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- if (static_branch_unlikely(&trace_function_exports_enabled))
- ftrace_exports(event, TRACE_EXPORT_FUNCTION);
- __buffer_unlock_commit(buffer, event);
- }
+ if (static_branch_unlikely(&trace_function_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_FUNCTION);
+ __buffer_unlock_commit(buffer, event);
}
#ifdef CONFIG_STACKTRACE
@@ -3089,7 +2902,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
#define FTRACE_KSTACK_NESTING 4
-#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
+#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING)
struct ftrace_stack {
unsigned long calls[FTRACE_KSTACK_ENTRIES];
@@ -3103,11 +2916,11 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct trace_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
- struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
unsigned int size, nr_entries;
struct ftrace_stack *fstack;
@@ -3150,6 +2963,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
nr_entries = stack_trace_save(fstack->calls, size, skip);
}
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Mark entry of stack trace as trampoline code */
+ if (tr->ops && tr->ops->trampoline) {
+ unsigned long tramp_start = tr->ops->trampoline;
+ unsigned long tramp_end = tramp_start + tr->ops->trampoline_size;
+ unsigned long *calls = fstack->calls;
+
+ for (int i = 0; i < nr_entries; i++) {
+ if (calls[i] >= tramp_start && calls[i] < tramp_end)
+ calls[i] = FTRACE_TRAMPOLINE_MARKER;
+ }
+ }
+#endif
+
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
struct_size(entry, caller, nr_entries),
trace_ctx);
@@ -3161,8 +2988,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
memcpy(&entry->caller, fstack->calls,
flex_array_size(entry, caller, nr_entries));
- if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
out:
/* Again, don't let gcc optimize things here */
@@ -3180,7 +3006,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
}
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
@@ -3189,7 +3015,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL);
return;
}
@@ -3206,7 +3032,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
return;
ct_irq_enter_irqson();
- __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL);
ct_irq_exit_irqson();
}
@@ -3223,8 +3049,8 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.array_buffer.buffer,
- tracing_gen_ctx(), skip, NULL);
+ __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer,
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3235,7 +3061,6 @@ static void
ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer, unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
@@ -3269,8 +3094,7 @@ ftrace_trace_userstack(struct trace_array *tr,
memset(&entry->caller, 0, sizeof(entry->caller));
stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
- if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
out_drop_count:
__this_cpu_dec(user_stack_count);
@@ -3439,15 +3263,17 @@ static void trace_printk_start_stop_comm(int enabled)
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- struct trace_array *tr = &global_trace;
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct bprint_entry *entry;
unsigned int trace_ctx;
char *tbuffer;
int len = 0, size;
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3480,10 +3306,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
entry->fmt = fmt;
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
- }
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
out:
ring_buffer_nest_end(buffer);
@@ -3503,7 +3327,6 @@ static int
__trace_array_vprintk(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, va_list args)
{
- struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
int len = 0, size;
struct print_entry *entry;
@@ -3538,10 +3361,8 @@ __trace_array_vprintk(struct trace_buffer *buffer,
entry->ip = ip;
memcpy(&entry->buf, tbuffer, len + 1);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
- }
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
out:
ring_buffer_nest_end(buffer);
@@ -3636,7 +3457,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -3648,7 +3469,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
- return trace_array_vprintk(&global_trace, ip, fmt, args);
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);
@@ -3771,17 +3592,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter)
}
/* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str,
- bool star, int len)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
{
unsigned long addr = (unsigned long)str;
struct trace_event *trace_event;
struct trace_event_call *event;
- /* Ignore strings with no length */
- if (star && !len)
- return true;
-
/* OK if part of the event data */
if ((addr >= (unsigned long)iter->ent) &&
(addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3821,133 +3637,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
return false;
}
-static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
-
-static int test_can_verify_check(const char *fmt, ...)
-{
- char buf[16];
- va_list ap;
- int ret;
-
- /*
- * The verifier is dependent on vsnprintf() modifies the va_list
- * passed to it, where it is sent as a reference. Some architectures
- * (like x86_32) passes it by value, which means that vsnprintf()
- * does not modify the va_list passed to it, and the verifier
- * would then need to be able to understand all the values that
- * vsnprintf can use. If it is passed by value, then the verifier
- * is disabled.
- */
- va_start(ap, fmt);
- vsnprintf(buf, 16, "%d", ap);
- ret = va_arg(ap, int);
- va_end(ap);
-
- return ret;
-}
-
-static void test_can_verify(void)
-{
- if (!test_can_verify_check("%d %d", 0, 1)) {
- pr_info("trace event string verifier disabled\n");
- static_branch_inc(&trace_no_verify);
- }
-}
-
/**
- * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * ignore_event - Check dereferenced fields while writing to the seq buffer
* @iter: The iterator that holds the seq buffer and the event being printed
- * @fmt: The format used to print the event
- * @ap: The va_list holding the data to print from @fmt.
*
- * This writes the data into the @iter->seq buffer using the data from
- * @fmt and @ap. If the format has a %s, then the source of the string
- * is examined to make sure it is safe to print, otherwise it will
- * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
- * pointer.
+ * At boot up, test_event_printk() will flag any event that dereferences
+ * a string with "%s" that does exist in the ring buffer. It may still
+ * be valid, as the string may point to a static string in the kernel
+ * rodata that never gets freed. But if the string pointer is pointing
+ * to something that was allocated, there's a chance that it can be freed
+ * by the time the user reads the trace. This would cause a bad memory
+ * access by the kernel and possibly crash the system.
+ *
+ * This function will check if the event has any fields flagged as needing
+ * to be checked at runtime and perform those checks.
+ *
+ * If it is found that a field is unsafe, it will write into the @iter->seq
+ * a message stating what was found to be unsafe.
+ *
+ * @return: true if the event is unsafe and should be ignored,
+ * false otherwise.
*/
-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap)
+bool ignore_event(struct trace_iterator *iter)
{
- const char *p = fmt;
- const char *str;
- int i, j;
+ struct ftrace_event_field *field;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+ struct list_head *head;
+ struct trace_seq *seq;
+ const void *ptr;
- if (WARN_ON_ONCE(!fmt))
- return;
+ trace_event = ftrace_find_event(iter->ent->type);
- if (static_branch_unlikely(&trace_no_verify))
- goto print;
+ seq = &iter->seq;
- /* Don't bother checking when doing a ftrace_dump() */
- if (iter->fmt == static_fmt_buf)
- goto print;
+ if (!trace_event) {
+ trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type);
+ return true;
+ }
- while (*p) {
- bool star = false;
- int len = 0;
-
- j = 0;
-
- /* We only care about %s and variants */
- for (i = 0; p[i]; i++) {
- if (i + 1 >= iter->fmt_size) {
- /*
- * If we can't expand the copy buffer,
- * just print it.
- */
- if (!trace_iter_expand_format(iter))
- goto print;
- }
+ event = container_of(trace_event, struct trace_event_call, event);
+ if (!(event->flags & TRACE_EVENT_FL_TEST_STR))
+ return false;
- if (p[i] == '\\' && p[i+1]) {
- i++;
- continue;
- }
- if (p[i] == '%') {
- /* Need to test cases like %08.*s */
- for (j = 1; p[i+j]; j++) {
- if (isdigit(p[i+j]) ||
- p[i+j] == '.')
- continue;
- if (p[i+j] == '*') {
- star = true;
- continue;
- }
- break;
- }
- if (p[i+j] == 's')
- break;
- star = false;
- }
- j = 0;
- }
- /* If no %s found then just print normally */
- if (!p[i])
- break;
+ head = trace_get_fields(event);
+ if (!head) {
+ trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n",
+ trace_event_name(event));
+ return true;
+ }
- /* Copy up to the %s, and print that */
- strncpy(iter->fmt, p, i);
- iter->fmt[i] = '\0';
- trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ /* Offsets are from the iter->ent that points to the raw event */
+ ptr = iter->ent;
- /*
- * If iter->seq is full, the above call no longer guarantees
- * that ap is in sync with fmt processing, and further calls
- * to va_arg() can return wrong positional arguments.
- *
- * Ensure that ap is no longer used in this case.
- */
- if (iter->seq.full) {
- p = "";
- break;
- }
+ list_for_each_entry(field, head, link) {
+ const char *str;
+ bool good;
- if (star)
- len = va_arg(ap, int);
+ if (!field->needs_test)
+ continue;
+
+ str = *(const char **)(ptr + field->offset);
- /* The ap now points to the string data of the %s */
- str = va_arg(ap, const char *);
+ good = trace_safe_str(iter, str);
/*
* If you hit this warning, it is likely that the
@@ -3958,45 +3710,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
- fmt, seq_buf_str(&iter->seq.seq))) {
- int ret;
-
- /* Try to safely read the string */
- if (star) {
- if (len + 1 > iter->fmt_size)
- len = iter->fmt_size - 1;
- if (len < 0)
- len = 0;
- ret = copy_from_kernel_nofault(iter->fmt, str, len);
- iter->fmt[len] = 0;
- star = false;
- } else {
- ret = strncpy_from_kernel_nofault(iter->fmt, str,
- iter->fmt_size);
- }
- if (ret < 0)
- trace_seq_printf(&iter->seq, "(0x%px)", str);
- else
- trace_seq_printf(&iter->seq, "(0x%px:%s)",
- str, iter->fmt);
- str = "[UNSAFE-MEMORY]";
- strcpy(iter->fmt, "%s");
- } else {
- strncpy(iter->fmt, p + i, j + 1);
- iter->fmt[j+1] = '\0';
+ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'",
+ trace_event_name(event), field->name)) {
+ trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n",
+ trace_event_name(event), field->name);
+ return true;
}
- if (star)
- trace_seq_printf(&iter->seq, iter->fmt, len, str);
- else
- trace_seq_printf(&iter->seq, iter->fmt, str);
-
- p += i + j + 1;
}
- print:
- if (*p)
- trace_seq_vprintf(&iter->seq, p, ap);
+ return false;
}
const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
@@ -4156,6 +3877,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
break;
entries++;
ring_buffer_iter_advance(buf_iter);
+ /* This could be a big loop */
+ cond_resched();
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
@@ -4368,7 +4091,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
get_total_entries(buf, &total, &entries);
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
- name, UTS_RELEASE);
+ name, init_utsname()->release);
seq_puts(m, "# -----------------------------------"
"---------------------------------\n");
seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
@@ -4380,6 +4103,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
preempt_model_none() ? "server" :
preempt_model_voluntary() ? "desktop" :
preempt_model_full() ? "preempt" :
+ preempt_model_lazy() ? "lazy" :
preempt_model_rt() ? "preempt_rt" :
"unknown",
/* These are reserved for later use */
@@ -4464,6 +4188,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (event) {
if (tr->trace_flags & TRACE_ITER_FIELDS)
return print_event_fields(iter, event);
+ /*
+ * For TRACE_EVENT() events, the print_fmt is not
+ * safe to use if the array has delta offsets
+ * Force printing via the fields.
+ */
+ if ((tr->text_delta || tr->data_delta) &&
+ event->type > __TRACE_LAST_TYPE)
+ return print_event_fields(iter, event);
+
return event->funcs->trace(iter, sym_flags, event);
}
@@ -5119,6 +4852,11 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
+#ifdef CONFIG_TRACER_SNAPSHOT
+ /* arrays with mapped buffer range do not have snapshots */
+ if (tr->range_addr_start && t->use_max_tr)
+ return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5211,7 +4949,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -5252,7 +4990,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = show_traces_release,
+ .release = tracing_seq_release,
};
static ssize_t
@@ -5331,6 +5069,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
+ if (count == 0 || count > KMALLOC_MAX_SIZE)
+ return -EINVAL;
+
if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
@@ -5367,7 +5108,8 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
u32 tracer_flags;
int i;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
+
tracer_flags = tr->current_trace->flags->val;
trace_opts = tr->current_trace->flags->opts;
@@ -5384,7 +5126,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
else
seq_printf(m, "no%s\n", trace_opts[i].name);
}
- mutex_unlock(&trace_types_lock);
return 0;
}
@@ -5436,10 +5177,9 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
- int *map;
-
if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD))
+ (mask == TRACE_ITER_RECORD_CMD) ||
+ (mask == TRACE_ITER_TRACE_PRINTK))
lockdep_assert_held(&event_mutex);
/* do nothing if flag is already set */
@@ -5451,6 +5191,25 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
+ if (mask == TRACE_ITER_TRACE_PRINTK) {
+ if (enabled) {
+ update_printk_trace(tr);
+ } else {
+ /*
+ * The global_trace cannot clear this.
+ * It's flag only gets cleared if another instance sets it.
+ */
+ if (printk_trace == &global_trace)
+ return -EINVAL;
+ /*
+ * An instance must always have it set.
+ * by default, that's the global_trace instane.
+ */
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+ }
+ }
+
if (enabled)
tr->trace_flags |= mask;
else
@@ -5460,20 +5219,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map) {
- tgid_map_max = pid_max;
- map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
- GFP_KERNEL);
- /*
- * Pairs with smp_load_acquire() in
- * trace_find_tgid_ptr() to ensure that if it observes
- * the tgid_map we just allocated then it also observes
- * the corresponding tgid_map_max value.
- */
- smp_store_release(&tgid_map, map);
- }
- if (!tgid_map) {
+ if (trace_alloc_tgid_map() < 0) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
}
@@ -5613,6 +5360,10 @@ static const struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
+ "By default tracefs removes all OTH file permission bits.\n"
+ "When mounting tracefs an optional group id can be specified\n"
+ "which adds the group to every directory and file in tracefs:\n\n"
+ "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n"
"# echo 0 > tracing_on : quick way to disable tracing\n"
"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
" Important files:\n"
@@ -5747,19 +5498,18 @@ static const char readme_msg[] =
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
-#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t <argname>[->field[->field|.field...]],\n"
-#else
- "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
+ "\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
- "\t symstr, <type>\\[<array-size>\\]\n"
+ "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
@@ -5768,6 +5518,8 @@ static const char readme_msg[] =
"\t efield: For event probes ('e' types), the field is on of the fields\n"
"\t of the <attached-group>/<attached-event>.\n"
#endif
+ " set_event\t\t- Enables events by name written into it\n"
+ "\t\t\t Can enable module events via: :mod:<module>\n"
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
" events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -5918,207 +5670,6 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
-static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
-{
- int pid = ++(*pos);
-
- return trace_find_tgid_ptr(pid);
-}
-
-static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
-{
- int pid = *pos;
-
- return trace_find_tgid_ptr(pid);
-}
-
-static void saved_tgids_stop(struct seq_file *m, void *v)
-{
-}
-
-static int saved_tgids_show(struct seq_file *m, void *v)
-{
- int *entry = (int *)v;
- int pid = entry - tgid_map;
- int tgid = *entry;
-
- if (tgid == 0)
- return SEQ_SKIP;
-
- seq_printf(m, "%d %d\n", pid, tgid);
- return 0;
-}
-
-static const struct seq_operations tracing_saved_tgids_seq_ops = {
- .start = saved_tgids_start,
- .stop = saved_tgids_stop,
- .next = saved_tgids_next,
- .show = saved_tgids_show,
-};
-
-static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
-{
- int ret;
-
- ret = tracing_check_open_get_tr(NULL);
- if (ret)
- return ret;
-
- return seq_open(filp, &tracing_saved_tgids_seq_ops);
-}
-
-
-static const struct file_operations tracing_saved_tgids_fops = {
- .open = tracing_saved_tgids_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
-{
- unsigned int *ptr = v;
-
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
- ptr++) {
- if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
- continue;
-
- return ptr;
- }
-
- return NULL;
-}
-
-static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
-{
- void *v;
- loff_t l = 0;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
-
- v = &savedcmd->map_cmdline_to_pid[0];
- while (l <= *pos) {
- v = saved_cmdlines_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
-}
-
-static void saved_cmdlines_stop(struct seq_file *m, void *v)
-{
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-}
-
-static int saved_cmdlines_show(struct seq_file *m, void *v)
-{
- char buf[TASK_COMM_LEN];
- unsigned int *pid = v;
-
- __trace_find_cmdline(*pid, buf);
- seq_printf(m, "%d %s\n", *pid, buf);
- return 0;
-}
-
-static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
- .start = saved_cmdlines_start,
- .next = saved_cmdlines_next,
- .stop = saved_cmdlines_stop,
- .show = saved_cmdlines_show,
-};
-
-static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
-{
- int ret;
-
- ret = tracing_check_open_get_tr(NULL);
- if (ret)
- return ret;
-
- return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
-}
-
-static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_saved_cmdlines_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static ssize_t
-tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- char buf[64];
- int r;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static int tracing_resize_saved_cmdlines(unsigned int val)
-{
- struct saved_cmdlines_buffer *s, *savedcmd_temp;
-
- s = allocate_cmdlines_buffer(val);
- if (!s)
- return -ENOMEM;
-
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
- savedcmd_temp = savedcmd;
- savedcmd = s;
- arch_spin_unlock(&trace_cmdline_lock);
- preempt_enable();
- free_saved_cmdlines_buffer(savedcmd_temp);
-
- return 0;
-}
-
-static ssize_t
-tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long val;
- int ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- /* must have at least 1 entry or less than PID_MAX_DEFAULT */
- if (!val || val > PID_MAX_DEFAULT)
- return -EINVAL;
-
- ret = tracing_resize_saved_cmdlines((unsigned int)val);
- if (ret < 0)
- return ret;
-
- *ppos += cnt;
-
- return cnt;
-}
-
-static const struct file_operations tracing_saved_cmdlines_size_fops = {
- .open = tracing_open_generic,
- .read = tracing_saved_cmdlines_size_read,
- .write = tracing_saved_cmdlines_size_write,
-};
-
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
static union trace_eval_map_item *
update_eval_map(union trace_eval_map_item *ptr)
@@ -6241,7 +5792,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
return;
}
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
if (!trace_eval_maps)
trace_eval_maps = map_array;
@@ -6265,8 +5816,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
map_array++;
}
memset(map_array, 0, sizeof(*map_array));
-
- mutex_unlock(&trace_eval_mutex);
}
static void trace_create_eval_file(struct dentry *d_tracer)
@@ -6428,28 +5977,34 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (cpu_id != RING_BUFFER_ALL_CPUS) {
/* make sure, this cpu is enabled in the mask */
- if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask))
+ return -EINVAL;
}
- ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
- if (ret < 0)
- ret = -ENOMEM;
+ return __tracing_resize_ring_buffer(tr, size, cpu_id);
+}
-out:
- mutex_unlock(&trace_types_lock);
+static void update_last_data(struct trace_array *tr)
+{
+ if (!tr->text_delta && !tr->data_delta)
+ return;
- return ret;
-}
+ /*
+ * Need to clear all CPU buffers as there cannot be events
+ * from the previous boot mixed with events with this boot
+ * as that will cause a confusing trace. Need to clear all
+ * CPU buffers, even for those that may currently be offline.
+ */
+ tracing_reset_all_cpus(&tr->array_buffer);
+ /* Using current data now */
+ tr->text_delta = 0;
+ tr->data_delta = 0;
+}
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6467,6 +6022,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
mutex_lock(&trace_types_lock);
+
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6488,7 +6046,7 @@ static void tracing_set_nop(struct trace_array *tr)
{
if (tr->current_trace == &nop_trace)
return;
-
+
tr->current_trace->enabled--;
if (tr->current_trace->reset)
@@ -6518,15 +6076,17 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
#endif
- int ret = 0;
+ int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
+
+ update_last_data(tr);
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
}
@@ -6534,43 +6094,37 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (strcmp(t->name, buf) == 0)
break;
}
- if (!t) {
- ret = -EINVAL;
- goto out;
- }
+ if (!t)
+ return -EINVAL;
+
if (t == tr->current_trace)
- goto out;
+ return 0;
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
+ ret = tr->cond_snapshot ? -EBUSY : 0;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
}
#endif
/* Some tracers won't work on kernel command line */
if (system_state < SYSTEM_RUNNING && t->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
t->name);
- goto out;
+ return -EINVAL;
}
/* Some tracers are only allowed for the top level buffer */
- if (!trace_ok_for_array(t, tr)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!trace_ok_for_array(t, tr))
+ return -EINVAL;
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->trace_ref) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->trace_ref)
+ return -EBUSY;
trace_branch_disable();
@@ -6595,12 +6149,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
*/
synchronize_rcu();
free_snapshot(tr);
+ tracing_disarm_snapshot(tr);
}
- if (t->use_max_tr && !tr->allocated_snapshot) {
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
- goto out;
+ if (!had_max_tr && t->use_max_tr) {
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
}
#else
tr->current_trace = &nop_trace;
@@ -6608,17 +6163,20 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t->init) {
ret = tracer_init(t, tr);
- if (ret)
- goto out;
+ if (ret) {
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (t->use_max_tr)
+ tracing_disarm_snapshot(tr);
+#endif
+ return ret;
+ }
}
tr->current_trace = t;
tr->current_trace->enabled++;
trace_branch_enable(tr);
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
static ssize_t
@@ -6696,22 +6254,18 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
if (ret < 0)
- goto out;
+ return ret;
if (tr->current_trace->update_thresh) {
ret = tr->current_trace->update_thresh(tr);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = cnt;
-out:
- mutex_unlock(&trace_types_lock);
-
- return ret;
+ return cnt;
}
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6930,31 +6484,29 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
- mutex_lock(&iter->mutex);
+ guard(mutex)(&iter->mutex);
/* return any leftover data */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (sret != -EBUSY)
- goto out;
+ return sret;
trace_seq_init(&iter->seq);
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
- goto out;
+ return sret;
}
waitagain:
sret = tracing_wait_pipe(filp);
if (sret <= 0)
- goto out;
+ return sret;
/* stop when tracing is finished */
- if (trace_empty(iter)) {
- sret = 0;
- goto out;
- }
+ if (trace_empty(iter))
+ return 0;
if (cnt >= TRACE_SEQ_BUFFER_SIZE)
cnt = TRACE_SEQ_BUFFER_SIZE - 1;
@@ -7018,9 +6570,6 @@ waitagain:
if (sret == -EBUSY)
goto waitagain;
-out:
- mutex_unlock(&iter->mutex);
-
return sret;
}
@@ -7264,6 +6813,37 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
}
static ssize_t
+tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ struct seq_buf seq;
+ char buf[64];
+
+ seq_buf_init(&seq, buf, 64);
+
+ seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
+ seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+}
+
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+ if (ret < 0)
+ __trace_array_put(tr);
+ return ret;
+}
+
+static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -7581,25 +7161,19 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
*/
int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
- int ret = 0;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (set && tr->no_filter_buffering_ref++)
- goto out;
+ return 0;
if (!set) {
- if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref))
+ return -EINVAL;
--tr->no_filter_buffering_ref;
}
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
struct ftrace_buffer_info {
@@ -7675,12 +7249,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->current_trace->use_max_tr)
+ return -EBUSY;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
@@ -7689,32 +7261,29 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
switch (val) {
case 0:
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
if (tr->allocated_snapshot)
free_snapshot(tr);
break;
case 1:
/* Only allow per-cpu swap if the ring buffer supports it */
#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
#endif
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, iter->cpu_file);
- else
- ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
- break;
+
+ ret = tracing_arm_snapshot_locked(tr);
+ if (ret)
+ return ret;
+
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
local_irq_disable();
@@ -7724,6 +7293,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
(void *)tr, 1);
}
+ tracing_disarm_snapshot(tr);
break;
default:
if (tr->allocated_snapshot) {
@@ -7739,8 +7309,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
ret = cnt;
}
-out:
- mutex_unlock(&trace_types_lock);
+
return ret;
}
@@ -7826,7 +7395,6 @@ static const struct file_operations tracing_pipe_fops = {
.read = tracing_read_pipe,
.splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
- .llseek = no_llseek,
};
static const struct file_operations tracing_entries_fops = {
@@ -7837,6 +7405,13 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_buffer_meta_fops = {
+ .open = tracing_buffer_meta_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
+};
+
static const struct file_operations tracing_total_entries_fops = {
.open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
@@ -7877,6 +7452,13 @@ static const struct file_operations trace_time_stamp_mode_fops = {
.release = tracing_single_release_tr,
};
+static const struct file_operations last_boot_fops = {
+ .open = tracing_open_generic_tr,
+ .read = tracing_last_boot_read,
+ .llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -7891,7 +7473,6 @@ static const struct file_operations snapshot_raw_fops = {
.read = tracing_buffers_read,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
- .llseek = no_llseek,
};
#endif /* CONFIG_TRACER_SNAPSHOT */
@@ -8114,12 +7695,11 @@ void tracing_log_err(struct trace_array *tr,
len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
- mutex_lock(&tracing_err_log_lock);
+ guard(mutex)(&tracing_err_log_lock);
+
err = get_tracing_log_err(tr, len);
- if (PTR_ERR(err) == -ENOMEM) {
- mutex_unlock(&tracing_err_log_lock);
+ if (PTR_ERR(err) == -ENOMEM)
return;
- }
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
@@ -8130,7 +7710,6 @@ void tracing_log_err(struct trace_array *tr,
err->info.ts = local_clock();
list_add_tail(&err->list, &tr->err_log);
- mutex_unlock(&tracing_err_log_lock);
}
static void clear_tracing_err_log(struct trace_array *tr)
@@ -8362,7 +7941,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
trace_access_unlock(iter->cpu_file);
if (ret < 0) {
- if (trace_empty(iter)) {
+ if (trace_empty(iter) && !iter->closed) {
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
@@ -8398,9 +7977,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- iter->wait_index++;
+ iter->closed = true;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8500,6 +8079,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ bool woken = false;
int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8573,17 +8153,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
- long wait_index;
if (ret)
goto out;
+ if (woken)
+ goto out;
+
ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
- wait_index = READ_ONCE(iter->wait_index);
-
ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
@@ -8592,10 +8172,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if (!tracer_tracing_is_on(iter->tr))
goto out;
- /* Make sure we see the new wait_index */
- smp_rmb();
- if (wait_index != iter->wait_index)
- goto out;
+ /* Iterate one more time to collect any new data then exit */
+ woken = true;
goto again;
}
@@ -8607,20 +8185,36 @@ out:
return ret;
}
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
+ int err;
+
+ if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
+ if (!(file->f_flags & O_NONBLOCK)) {
+ err = ring_buffer_wait(iter->array_buffer->buffer,
+ iter->cpu_file,
+ iter->tr->buffer_percent,
+ NULL, NULL);
+ if (err)
+ return err;
+ }
- if (cmd)
- return -ENOIOCTLCMD;
+ return ring_buffer_map_get_reader(iter->array_buffer->buffer,
+ iter->cpu_file);
+ } else if (cmd) {
+ return -ENOTTY;
+ }
+ /*
+ * An ioctl call with cmd 0 to the ring buffer file will wake up all
+ * waiters
+ */
mutex_lock(&trace_types_lock);
- iter->wait_index++;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8628,6 +8222,80 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+static int get_snapshot_map(struct trace_array *tr)
+{
+ int err = 0;
+
+ /*
+ * Called with mmap_lock held. lockdep would be unhappy if we would now
+ * take trace_types_lock. Instead use the specific
+ * snapshot_trigger_lock.
+ */
+ spin_lock(&tr->snapshot_trigger_lock);
+
+ if (tr->snapshot || tr->mapped == UINT_MAX)
+ err = -EBUSY;
+ else
+ tr->mapped++;
+
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ /* Wait for update_max_tr() to observe iter->tr->mapped */
+ if (tr->mapped == 1)
+ synchronize_rcu();
+
+ return err;
+
+}
+static void put_snapshot_map(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->mapped))
+ tr->mapped--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+#else
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+#endif
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file));
+ put_snapshot_map(iter->tr);
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+ .close = tracing_buffers_mmap_close,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+ int ret = 0;
+
+ /* Currently the boot mapped buffer is not supported for mmap */
+ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
+ return -ENODEV;
+
+ ret = get_snapshot_map(iter->tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
+ if (ret)
+ put_snapshot_map(iter->tr);
+
+ vma->vm_ops = &tracing_buffers_vmops;
+
+ return ret;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
@@ -8636,7 +8304,7 @@ static const struct file_operations tracing_buffers_fops = {
.flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
- .llseek = no_llseek,
+ .mmap = tracing_buffers_mmap,
};
static ssize_t
@@ -8720,15 +8388,22 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
char *buf;
int r;
- /* 256 should be plenty to hold the amount needed */
- buf = kmalloc(256, GFP_KERNEL);
+ /* 512 should be plenty to hold the amount needed */
+#define DYN_INFO_BUF_SIZE 512
+
+ buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n",
+ r = scnprintf(buf, DYN_INFO_BUF_SIZE,
+ "%ld pages:%ld groups: %ld\n"
+ "ftrace boot update time = %llu (ns)\n"
+ "ftrace module total update time = %llu (ns)\n",
ftrace_update_tot_cnt,
ftrace_number_of_pages,
- ftrace_number_of_groups);
+ ftrace_number_of_groups,
+ ftrace_update_time,
+ ftrace_total_mod_time);
ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
kfree(buf);
@@ -8857,8 +8532,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
- if (glob[0] == '!')
- return unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (glob[0] == '!') {
+ ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
+ if (!ret)
+ tracing_disarm_snapshot(tr);
+
+ return ret;
+ }
if (!param)
goto out_reg;
@@ -8877,12 +8557,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
return ret;
out_reg:
- ret = tracing_alloc_snapshot_instance(tr);
+ ret = tracing_arm_snapshot(tr);
if (ret < 0)
goto out;
ret = register_ftrace_function_probe(glob, tr, ops, count);
-
+ if (ret < 0)
+ tracing_disarm_snapshot(tr);
out:
return ret < 0 ? ret : 0;
}
@@ -8977,12 +8658,17 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
+ if (tr->range_addr_start)
+ trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &tracing_buffer_meta_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
- tr, cpu, &snapshot_fops);
+ if (!tr->range_addr_start) {
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
- tr, cpu, &snapshot_raw_fops);
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+ }
#endif
}
@@ -9519,7 +9205,21 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
buf->tr = tr;
- buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (tr->range_addr_start && tr->range_addr_size) {
+ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size);
+
+ ring_buffer_last_boot_delta(buf->buffer,
+ &tr->text_delta, &tr->data_delta);
+ /*
+ * This is basically the same as a mapped buffer,
+ * with the same restrictions.
+ */
+ tr->mapped++;
+ } else {
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ }
if (!buf->buffer)
return -ENOMEM;
@@ -9556,6 +9256,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Fix mapped buffer trace arrays do not have snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9656,7 +9360,9 @@ static int trace_array_create_dir(struct trace_array *tr)
}
static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+ unsigned long range_addr_start,
+ unsigned long range_addr_size)
{
struct trace_array *tr;
int ret;
@@ -9682,6 +9388,10 @@ trace_array_create_systems(const char *name, const char *systems)
goto out_free_tr;
}
+ /* Only for boot up memory mapped ring buffers */
+ tr->range_addr_start = range_addr_start;
+ tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9689,7 +9399,9 @@ trace_array_create_systems(const char *name, const char *systems)
raw_spin_lock_init(&tr->start_lock);
tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
+#ifdef CONFIG_TRACER_MAX_TRACE
+ spin_lock_init(&tr->snapshot_trigger_lock);
+#endif
tr->current_trace = &nop_trace;
INIT_LIST_HEAD(&tr->systems);
@@ -9697,6 +9409,10 @@ trace_array_create_systems(const char *name, const char *systems)
INIT_LIST_HEAD(&tr->hist_vars);
INIT_LIST_HEAD(&tr->err_log);
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&tr->mod_events);
+#endif
+
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -9737,7 +9453,7 @@ trace_array_create_systems(const char *name, const char *systems)
static struct trace_array *trace_array_create(const char *name)
{
- return trace_array_create_systems(name, NULL);
+ return trace_array_create_systems(name, NULL, 0, 0);
}
static int instance_mkdir(const char *name)
@@ -9745,23 +9461,45 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
ret = -EEXIST;
if (trace_array_find(name))
- goto out_unlock;
+ return -EEXIST;
tr = trace_array_create(name);
ret = PTR_ERR_OR_ZERO(tr);
-out_unlock:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return ret;
}
+static u64 map_pages(u64 start, u64 size)
+{
+ struct page **pages;
+ phys_addr_t page_start;
+ unsigned int page_count;
+ unsigned int i;
+ void *vaddr;
+
+ page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ page_start = start;
+ pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return 0;
+
+ for (i = 0; i < page_count; i++) {
+ phys_addr_t addr = page_start + i * PAGE_SIZE;
+ pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+ }
+ vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+
+ return (u64)(unsigned long)vaddr;
+}
+
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
@@ -9783,24 +9521,23 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
{
struct trace_array *tr;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ tr->ref++;
+ return tr;
+ }
}
- tr = trace_array_create_systems(name, systems);
+ tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
-out_unlock:
- if (tr)
+ else
tr->ref++;
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return tr;
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
@@ -9821,6 +9558,9 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1 << i, 0);
}
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9848,48 +9588,36 @@ static int __remove_instance(struct trace_array *tr)
int trace_array_destroy(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret;
if (!this_tr)
return -EINVAL;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
/* Making sure trace array exists before destroying it. */
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == this_tr) {
- ret = __remove_instance(tr);
- break;
- }
+ if (tr == this_tr)
+ return __remove_instance(tr);
}
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
-
- return ret;
+ return -ENODEV;
}
EXPORT_SYMBOL_GPL(trace_array_destroy);
static int instance_rmdir(const char *name)
{
struct trace_array *tr;
- int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
tr = trace_array_find(name);
- if (tr)
- ret = __remove_instance(tr);
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ if (!tr)
+ return -ENODEV;
- return ret;
+ return __remove_instance(tr);
}
static __init void create_trace_instances(struct dentry *d_tracer)
@@ -9902,19 +9630,16 @@ static __init void create_trace_instances(struct dentry *d_tracer)
if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->name)
continue;
if (MEM_FAIL(trace_array_create_dir(tr) < 0,
"Failed to create instance directory\n"))
- break;
+ return;
}
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
}
static void
@@ -9983,10 +9708,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
+ if (tr->range_addr_start) {
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, &snapshot_fops);
+ } else {
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
+ tr, &snapshot_fops);
#endif
+ }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
@@ -10099,6 +9829,24 @@ late_initcall_sync(trace_eval_sync);
#ifdef CONFIG_MODULES
+
+bool module_exists(const char *module)
+{
+ /* All modules have the symbol __this_module */
+ static const char this_mod[] = "__this_module";
+ char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
+ unsigned long val;
+ int n;
+
+ n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
+
+ if (n > sizeof(modname) - 1)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
static void trace_module_add_evals(struct module *mod)
{
if (!mod->num_trace_evals)
@@ -10123,7 +9871,7 @@ static void trace_module_remove_evals(struct module *mod)
if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
map = trace_eval_maps;
@@ -10135,12 +9883,10 @@ static void trace_module_remove_evals(struct module *mod)
map = map->tail.next;
}
if (!map)
- goto out;
+ return;
*last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
- out:
- mutex_unlock(&trace_eval_mutex);
}
#else
static inline void trace_module_remove_evals(struct module *mod) { }
@@ -10254,14 +10000,14 @@ static struct notifier_block trace_die_notifier = {
static int trace_die_panic_handler(struct notifier_block *self,
unsigned long ev, void *unused)
{
- if (!ftrace_dump_on_oops)
+ if (!ftrace_dump_on_oops_enabled())
return NOTIFY_DONE;
/* The die notifier requires DIE_OOPS to trigger */
if (self == &trace_die_notifier && ev != DIE_OOPS)
return NOTIFY_DONE;
- ftrace_dump(ftrace_dump_on_oops);
+ ftrace_dump(DUMP_PARAM);
return NOTIFY_DONE;
}
@@ -10302,12 +10048,12 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
-void trace_init_global_iter(struct trace_iterator *iter)
+static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr)
{
- iter->tr = &global_trace;
+ iter->tr = tr;
iter->trace = iter->tr->current_trace;
iter->cpu_file = RING_BUFFER_ALL_CPUS;
- iter->array_buffer = &global_trace.array_buffer;
+ iter->array_buffer = &tr->array_buffer;
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
@@ -10327,22 +10073,19 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->fmt_size = STATIC_FMT_BUF_SIZE;
}
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ trace_init_iter(iter, &global_trace);
+}
+
+static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
- static atomic_t dump_running;
- struct trace_array *tr = &global_trace;
unsigned int old_userobj;
unsigned long flags;
int cnt = 0, cpu;
- /* Only allow one dump user at a time. */
- if (atomic_inc_return(&dump_running) != 1) {
- atomic_dec(&dump_running);
- return;
- }
-
/*
* Always turn off tracing when we dump.
* We don't need to show trace output of what happens
@@ -10351,12 +10094,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
* If the user does a sysrq-z, then they can re-enable
* tracing with echo 1 > tracing_on.
*/
- tracing_off();
+ tracer_tracing_off(tr);
local_irq_save(flags);
/* Simulate the iterator */
- trace_init_global_iter(&iter);
+ trace_init_iter(&iter, tr);
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
@@ -10367,21 +10110,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* don't look at user memory in panic mode */
tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- switch (oops_dump_mode) {
- case DUMP_ALL:
- iter.cpu_file = RING_BUFFER_ALL_CPUS;
- break;
- case DUMP_ORIG:
+ if (dump_mode == DUMP_ORIG)
iter.cpu_file = raw_smp_processor_id();
- break;
- case DUMP_NONE:
- goto out_enable;
- default:
- printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ else
iter.cpu_file = RING_BUFFER_ALL_CPUS;
- }
- printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ if (tr == &global_trace)
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ else
+ printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name);
/* Did function tracer already get disabled? */
if (ftrace_is_dead()) {
@@ -10423,15 +10160,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
else
printk(KERN_TRACE "---------------------------------\n");
- out_enable:
tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
- atomic_dec(&dump_running);
local_irq_restore(flags);
}
+
+static void ftrace_dump_by_param(void)
+{
+ bool first_param = true;
+ char dump_param[MAX_TRACER_SIZE];
+ char *buf, *token, *inst_name;
+ struct trace_array *tr;
+
+ strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE);
+ buf = dump_param;
+
+ while ((token = strsep(&buf, ",")) != NULL) {
+ if (first_param) {
+ first_param = false;
+ if (!strcmp("0", token))
+ continue;
+ else if (!strcmp("1", token)) {
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ continue;
+ }
+ else if (!strcmp("2", token) ||
+ !strcmp("orig_cpu", token)) {
+ ftrace_dump_one(&global_trace, DUMP_ORIG);
+ continue;
+ }
+ }
+
+ inst_name = strsep(&token, "=");
+ tr = trace_array_find(inst_name);
+ if (!tr) {
+ printk(KERN_TRACE "Instance %s not found\n", inst_name);
+ continue;
+ }
+
+ if (token && (!strcmp("2", token) ||
+ !strcmp("orig_cpu", token)))
+ ftrace_dump_one(tr, DUMP_ORIG);
+ else
+ ftrace_dump_one(tr, DUMP_ALL);
+ }
+}
+
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
+{
+ static atomic_t dump_running;
+
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+
+ switch (oops_dump_mode) {
+ case DUMP_ALL:
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ break;
+ case DUMP_ORIG:
+ ftrace_dump_one(&global_trace, DUMP_ORIG);
+ break;
+ case DUMP_PARAM:
+ ftrace_dump_by_param();
+ break;
+ case DUMP_NONE:
+ break;
+ default:
+ printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ ftrace_dump_one(&global_trace, DUMP_ALL);
+ }
+
+ atomic_dec(&dump_running);
+}
EXPORT_SYMBOL_GPL(ftrace_dump);
#define WRITE_BUFSIZE 4096
@@ -10546,6 +10352,7 @@ __init static void enable_instances(void)
{
struct trace_array *tr;
char *curr_str;
+ char *name;
char *str;
char *tok;
@@ -10554,19 +10361,107 @@ __init static void enable_instances(void)
str = boot_instance_info;
while ((curr_str = strsep(&str, "\t"))) {
+ phys_addr_t start = 0;
+ phys_addr_t size = 0;
+ unsigned long addr = 0;
+ bool traceprintk = false;
+ bool traceoff = false;
+ char *flag_delim;
+ char *addr_delim;
tok = strsep(&curr_str, ",");
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
- do_allocate_snapshot(tok);
+ flag_delim = strchr(tok, '^');
+ addr_delim = strchr(tok, '@');
- tr = trace_array_get_by_name(tok, NULL);
- if (!tr) {
- pr_warn("Failed to create instance buffer %s\n", curr_str);
+ if (addr_delim)
+ *addr_delim++ = '\0';
+
+ if (flag_delim)
+ *flag_delim++ = '\0';
+
+ name = tok;
+
+ if (flag_delim) {
+ char *flag;
+
+ while ((flag = strsep(&flag_delim, "^"))) {
+ if (strcmp(flag, "traceoff") == 0) {
+ traceoff = true;
+ } else if ((strcmp(flag, "printk") == 0) ||
+ (strcmp(flag, "traceprintk") == 0) ||
+ (strcmp(flag, "trace_printk") == 0)) {
+ traceprintk = true;
+ } else {
+ pr_info("Tracing: Invalid instance flag '%s' for %s\n",
+ flag, name);
+ }
+ }
+ }
+
+ tok = addr_delim;
+ if (tok && isdigit(*tok)) {
+ start = memparse(tok, &tok);
+ if (!start) {
+ pr_warn("Tracing: Invalid boot instance address for %s\n",
+ name);
+ continue;
+ }
+ if (*tok != ':') {
+ pr_warn("Tracing: No size specified for instance %s\n", name);
+ continue;
+ }
+ tok++;
+ size = memparse(tok, &tok);
+ if (!size) {
+ pr_warn("Tracing: Invalid boot instance size for %s\n",
+ name);
+ continue;
+ }
+ } else if (tok) {
+ if (!reserve_mem_find_by_name(tok, &start, &size)) {
+ start = 0;
+ pr_warn("Failed to map boot instance %s to %s\n", name, tok);
+ continue;
+ }
+ }
+
+ if (start) {
+ addr = map_pages(start, size);
+ if (addr) {
+ pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
+ name, &start, (unsigned long)size);
+ } else {
+ pr_warn("Tracing: Failed to map boot instance %s\n", name);
+ continue;
+ }
+ } else {
+ /* Only non mapped buffers have snapshot buffers */
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(name);
+ }
+
+ tr = trace_array_create_systems(name, NULL, addr, size);
+ if (IS_ERR(tr)) {
+ pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str);
continue;
}
- /* Allow user space to delete it */
- trace_array_put(tr);
+
+ if (traceoff)
+ tracer_tracing_off(tr);
+
+ if (traceprintk)
+ update_printk_trace(tr);
+
+ /*
+ * If start is set, then this is a mapped buffer, and
+ * cannot be deleted by user space, so keep the reference
+ * to it.
+ */
+ if (start) {
+ tr->flags |= TRACE_ARRAY_FL_BOOT;
+ tr->ref++;
+ }
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
@@ -10659,9 +10554,15 @@ __init static int tracer_alloc_buffers(void)
global_trace.current_trace = &nop_trace;
global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-
+#ifdef CONFIG_TRACER_MAX_TRACE
+ spin_lock_init(&global_trace.snapshot_trigger_lock);
+#endif
ftrace_init_global_array_ops(&global_trace);
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&global_trace.mod_events);
+#endif
+
init_trace_flags_index(&global_trace);
register_tracer(&nop_trace);
@@ -10689,14 +10590,12 @@ __init static int tracer_alloc_buffers(void)
register_snapshot_cmd();
- test_can_verify();
-
return 0;
out_free_pipe_cpumask:
free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
- free_saved_cmdlines_buffer(savedcmd);
+ trace_free_saved_cmdlines_buffer();
out_free_temp_buffer:
ring_buffer_free(temp_buffer);
out_rm_hp_state:
@@ -10709,6 +10608,14 @@ out:
return ret;
}
+#ifdef CONFIG_FUNCTION_TRACER
+/* Used to set module cached ftrace filtering at boot up */
+__init struct trace_array *trace_get_global_array(void)
+{
+ return &global_trace;
+}
+#endif
+
void __init ftrace_boot_snapshot(void)
{
#ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 00f873910c5d..9c21ba45b7af 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -46,6 +46,7 @@ enum trace_type {
TRACE_BRANCH,
TRACE_GRAPH_RET,
TRACE_GRAPH_ENT,
+ TRACE_GRAPH_RETADDR_ENT,
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
@@ -334,8 +335,8 @@ struct trace_array {
*/
struct array_buffer max_buffer;
bool allocated_snapshot;
-#endif
-#ifdef CONFIG_TRACER_MAX_TRACE
+ spinlock_t snapshot_trigger_lock;
+ unsigned int snapshot;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -343,6 +344,13 @@ struct trace_array {
struct irq_work fsnotify_irqwork;
#endif
#endif
+ /* The below is for memory mapped ring buffer */
+ unsigned int mapped;
+ unsigned long range_addr_start;
+ unsigned long range_addr_size;
+ long text_delta;
+ long data_delta;
+
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
/*
@@ -392,10 +400,16 @@ struct trace_array {
cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
+#ifdef CONFIG_MODULES
+ struct list_head mod_events;
+#endif
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
struct trace_pid_list __rcu *function_no_pids;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ struct fgraph_ops *gops;
+#endif
#ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_head func_probes;
@@ -419,9 +433,20 @@ struct trace_array {
};
enum {
- TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
+ TRACE_ARRAY_FL_MOD_INIT = BIT(2),
};
+#ifdef CONFIG_MODULES
+bool module_exists(const char *module);
+#else
+static inline bool module_exists(const char *module)
+{
+ return false;
+}
+#endif
+
extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
@@ -501,6 +526,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
TRACE_GRAPH_ENT); \
+ IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\
+ TRACE_GRAPH_RETADDR_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct func_repeats_entry, \
@@ -640,6 +667,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer,
unsigned long len,
unsigned int trace_ctx);
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu);
+
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -651,9 +680,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap) __printf(2, 0);
char *trace_iter_expand_format(struct trace_iterator *iter);
+bool ignore_event(struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
@@ -678,9 +706,10 @@ void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-void trace_graph_return(struct ftrace_graph_ret *trace);
-int trace_graph_entry(struct ftrace_graph_ent *trace);
-void set_graph_array(struct trace_array *tr);
+void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -703,8 +732,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -760,6 +787,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
extern unsigned long ftrace_update_tot_cnt;
extern unsigned long ftrace_number_of_pages;
extern unsigned long ftrace_number_of_groups;
+extern u64 ftrace_update_time;
+extern u64 ftrace_total_mod_time;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
@@ -867,6 +896,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_GRAPH_TIME 0x400
#define TRACE_GRAPH_PRINT_RETVAL 0x800
#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
+#define TRACE_GRAPH_PRINT_RETADDR 0x2000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -888,15 +918,68 @@ extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx);
+extern int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime);
+
+extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern void free_fgraph_ops(struct trace_array *tr);
+
+enum {
+ TRACE_GRAPH_FL = 1,
+
+ /*
+ * In the very unlikely case that an interrupt came in
+ * at a start of graph tracing, and we want to trace
+ * the function in that interrupt, the depth can be greater
+ * than zero, because of the preempted start of a previous
+ * trace. In an even more unlikely case, depth could be 2
+ * if a softirq interrupted the start of graph tracing,
+ * followed by an interrupt preempting a start of graph
+ * tracing in the softirq, and depth can even be 3
+ * if an NMI came in at the start of an interrupt function
+ * that preempted a softirq start of a function that
+ * preempted normal context!!!! Luckily, it can't be
+ * greater than 3, so the next two bits are a mask
+ * of what the depth is when we set TRACE_GRAPH_FL
+ */
+
+ TRACE_GRAPH_DEPTH_START_BIT,
+ TRACE_GRAPH_DEPTH_END_BIT,
+
+ /*
+ * To implement set_graph_notrace, if this bit is set, we ignore
+ * function graph tracing of called functions, until the return
+ * function is called to clear it.
+ */
+ TRACE_GRAPH_NOTRACE_BIT,
+};
+
+#define TRACE_GRAPH_NOTRACE (1 << TRACE_GRAPH_NOTRACE_BIT)
+
+static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
+{
+ return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
+}
+
+static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth)
+{
+ *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT);
+ *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT;
+}
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int
+ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
@@ -918,13 +1001,12 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
}
if (ftrace_lookup_ip(hash, addr)) {
-
/*
* This needs to be cleared on the return functions
* when the depth is zero.
*/
- trace_recursion_set(TRACE_GRAPH_BIT);
- trace_recursion_set_depth(trace->depth);
+ *task_var |= TRACE_GRAPH_FL;
+ ftrace_graph_set_depth(task_var, trace->depth);
/*
* If no irqs are to be traced, but a set_graph_function
@@ -943,11 +1025,14 @@ out:
return ret;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void
+ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{
- if (trace_recursion_test(TRACE_GRAPH_BIT) &&
- trace->depth == trace_recursion_depth())
- trace_recursion_clear(TRACE_GRAPH_BIT);
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
+ if ((*task_var & TRACE_GRAPH_FL) &&
+ trace->depth == ftrace_graph_depth(task_var))
+ *task_var &= ~TRACE_GRAPH_FL;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
@@ -973,7 +1058,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return ret;
}
#else
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
return 1;
}
@@ -982,27 +1067,38 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
return 0;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{ }
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
+extern bool fgraph_sleep_time;
-static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+static inline bool
+ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
{
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
/* trace it when it is-nested-in or is a function enabled. */
- return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
- ftrace_graph_addr(trace)) ||
+ return !((*task_var & TRACE_GRAPH_FL) ||
+ ftrace_graph_addr(task_var, trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
}
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops);
+
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
return TRACE_TYPE_UNHANDLED;
}
+static inline void free_fgraph_ops(struct trace_array *tr) { }
+/* ftrace_ops may not be defined */
+#define init_array_fgraph_ops(tr, ops) do { } while (0)
+#define allocate_fgraph_ops(tr, ops) ({ 0; })
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
extern struct list_head ftrace_pids;
@@ -1033,6 +1129,7 @@ void ftrace_destroy_function_files(struct trace_array *tr);
int ftrace_allocate_ftrace_ops(struct trace_array *tr);
void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
+struct trace_array *trace_get_global_array(void);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -1250,6 +1347,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
+ C(TRACE_PRINTK, "trace_printk_dest"), \
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
@@ -1330,7 +1428,8 @@ struct ftrace_event_field {
int filter_type;
int offset;
int size;
- int is_signed;
+ unsigned int is_signed:1;
+ unsigned int needs_test:1;
int len;
};
@@ -1357,10 +1456,6 @@ struct trace_subsystem_dir {
int nr_events;
};
-extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event);
-
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
@@ -1375,6 +1470,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr,
trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL);
}
+DECLARE_PER_CPU(bool, trace_taskinfo_save);
+int trace_save_cmdline(struct task_struct *tsk);
+int trace_create_savedcmd(void);
+int trace_alloc_tgid_map(void);
+void trace_free_saved_cmdlines_buffer(void);
+
+extern const struct file_operations tracing_saved_cmdlines_fops;
+extern const struct file_operations tracing_saved_tgids_fops;
+extern const struct file_operations tracing_saved_cmdlines_size_fops;
+
DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
DECLARE_PER_CPU(int, trace_buffered_event_cnt);
void trace_buffered_event_disable(void);
@@ -1562,6 +1667,29 @@ static inline void *event_file_data(struct file *filp)
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be taken under the event_mutex lock, and then checked
+ * if the EVENT_FILE_FL_FREED flag is set. If it is, then the
+ * data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use the event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it is needed to be retrieved again and the event_mutex
+ * is still held, then the event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = READ_ONCE(file_inode(filp)->i_private);
+ if (!file || file->flags & EVENT_FILE_FL_FREED)
+ return NULL;
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
@@ -1973,12 +2101,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len)
#ifdef CONFIG_TRACER_SNAPSHOT
void tracing_snapshot_instance(struct trace_array *tr);
int tracing_alloc_snapshot_instance(struct trace_array *tr);
+int tracing_arm_snapshot(struct trace_array *tr);
+void tracing_disarm_snapshot(struct trace_array *tr);
#else
static inline void tracing_snapshot_instance(struct trace_array *tr) { }
static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
return 0;
}
+static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; }
+static inline void tracing_disarm_snapshot(struct trace_array *tr) { }
#endif
#ifdef CONFIG_PREEMPT_TRACER
@@ -2067,4 +2199,11 @@ static inline int rv_init_interface(void)
}
#endif
+/*
+ * This is used only to distinguish
+ * function address from trampoline code.
+ * So this value has no meaning.
+ */
+#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX)
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 54d5fa35c90a..e19c32f2a938 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -92,7 +92,6 @@ static void trace_do_benchmark(void)
bm_total += delta;
bm_totalsq += delta * delta;
-
if (bm_cnt > 1) {
/*
* Apply Welford's method to calculate standard deviation:
@@ -105,7 +104,7 @@ static void trace_do_benchmark(void)
stddev = 0;
delta = bm_total;
- do_div(delta, bm_cnt);
+ do_div(delta, (u32)bm_cnt);
avg = delta;
if (stddev > 0) {
@@ -127,7 +126,7 @@ static void trace_do_benchmark(void)
seed = stddev;
if (!last_seed)
break;
- do_div(seed, last_seed);
+ seed = div64_u64(seed, last_seed);
seed += last_seed;
do_div(seed, 2);
} while (i++ < 10 && last_seed != seed);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e47fdb4c92fb..6d08a5523ce0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,7 +30,6 @@ static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
- struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct trace_buffer *buffer;
struct trace_array_cpu *data;
@@ -74,16 +73,13 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
p--;
p++;
- strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
- strncpy(entry->file, p, TRACE_FILE_SIZE);
- entry->func[TRACE_FUNC_SIZE] = 0;
- entry->file[TRACE_FILE_SIZE] = 0;
+ strscpy(entry->func, f->data.func);
+ strscpy(entry->file, p);
entry->constant = f->constant;
entry->line = f->data.line;
entry->correct = val == expect;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
out:
current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 4702efb00ff2..4cb2ebc439be 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -154,5 +154,5 @@ static atomic64_t trace_counter;
*/
u64 notrace trace_clock_counter(void)
{
- return atomic64_add_return(1, &trace_counter);
+ return atomic64_inc_return(&trace_counter);
}
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4376887e0d8a..a322e4f249a5 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
int argc, ret = -ENOENT;
- char **argv;
+ char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc);
- argv = argv_split(GFP_KERNEL, raw_command, &argc);
if (!argv)
return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':') {
- ret = -EINVAL;
- goto out;
- }
+ if (argv[0][1] != ':')
+ return -EINVAL;
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event) {
- ret = -EINVAL;
- goto out;
- }
+ if (!event)
+ return -EINVAL;
event++;
}
@@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (!system && event[0] == '\0') {
- ret = -EINVAL;
- goto out;
- }
+ if (!system && event[0] == '\0')
+ return -EINVAL;
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
@@ -120,8 +113,6 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
}
tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
-out:
- argv_free(argv);
return ret;
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c47422b20908..fbfb396905a6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -85,9 +85,35 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
);
-/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+
+/* Function call entry with a return address */
+FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
+
+ TRACE_GRAPH_RETADDR_ENT,
+
+ F_STRUCT(
+ __field_struct( struct fgraph_retaddr_ent, graph_ent )
+ __field_packed( unsigned long, graph_ent, func )
+ __field_packed( int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, retaddr )
+ ),
+
+ F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth,
+ (void *)__entry->retaddr)
+);
+
+#else
+
+#ifndef fgraph_retaddr_ent_entry
+#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry
+#endif
+
+#endif
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -98,8 +124,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_packed( unsigned long, ret, retval )
__field_packed( int, ret, depth )
__field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field(unsigned long long, calltime )
+ __field(unsigned long long, rettime )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
@@ -110,6 +136,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
#else
+/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -119,8 +146,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_packed( unsigned long, ret, func )
__field_packed( int, ret, depth )
__field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field(unsigned long long, calltime )
+ __field(unsigned long long, rettime )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 03c851f57969..82fd637cfc19 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -220,7 +220,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group,
if (!ep->event_system)
goto error;
- ret = trace_probe_init(&ep->tp, this_event, group, false);
+ ret = trace_probe_init(&ep->tp, this_event, group, false, nargs);
if (ret < 0)
goto error;
@@ -390,8 +390,8 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
unsigned long val;
int ret;
@@ -438,7 +438,7 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
return;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
- store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &edata->ep->tp, rec, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -912,10 +912,15 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
}
- mutex_lock(&event_mutex);
- event_call = find_and_get_event(sys_name, sys_event);
- ep = alloc_event_probe(group, event, event_call, argc - 2);
- mutex_unlock(&event_mutex);
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ ret = -E2BIG;
+ goto error;
+ }
+
+ scoped_guard(mutex, &event_mutex) {
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ }
if (IS_ERR(ep)) {
ret = PTR_ERR(ep);
@@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
argc -= 2; argv += 2;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ret = trace_eprobe_tp_update_arg(ep, argv, i);
if (ret)
@@ -947,18 +952,21 @@ static int __trace_eprobe_create(int argc, const char *argv[])
if (ret < 0)
goto error;
init_trace_eprobe_call(ep);
- mutex_lock(&event_mutex);
- ret = trace_probe_register_event_call(&ep->tp);
- if (ret) {
- if (ret == -EEXIST) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(0, EVENT_EXIST);
+ scoped_guard(mutex, &event_mutex) {
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ if (ret < 0) {
+ trace_probe_unregister_event_call(&ep->tp);
+ goto error;
}
- mutex_unlock(&event_mutex);
- goto error;
}
- ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
- mutex_unlock(&event_mutex);
return ret;
parse_error:
ret = -EINVAL;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 05e791241812..3ff9caa4a71b 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
+ struct hw_perf_event *hwc = &p_event->hw;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(p_event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(p_event);
+ }
+
/*
* If TRACE_REG_PERF_ADD returns false; no custom action was performed
* and we need to take the default action of enqueueing our event on
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7c364b87352e..513de9ceb80e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system)
}
static struct ftrace_event_field *
-__find_event_field(struct list_head *head, char *name)
+__find_event_field(struct list_head *head, const char *name)
{
struct ftrace_event_field *field;
@@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type, int len)
+ int is_signed, int filter_type, int len,
+ int need_test)
{
struct ftrace_event_field *field;
@@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->needs_test = need_test;
field->len = len;
list_add(&field->link, head);
@@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type, 0);
+ is_signed, filter_type, 0, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
static int trace_define_field_ext(struct trace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
- int filter_type, int len)
+ int filter_type, int len, int need_test)
{
struct list_head *head;
@@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type, len);
+ is_signed, filter_type, len, need_test);
}
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type, 0); \
+ filter_type, 0, 0); \
if (ret) \
return ret;
@@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER, 0); \
+ is_signed_type(type), FILTER_OTHER, \
+ 0, 0); \
if (ret) \
return ret;
@@ -244,19 +247,16 @@ int trace_event_get_offsets(struct trace_event_call *call)
return tail->offset + tail->size;
}
-/*
- * Check if the referenced field is an array and return true,
- * as arrays are OK to dereference.
- */
-static bool test_field(const char *fmt, struct trace_event_call *call)
+
+static struct trace_event_fields *find_event_field(const char *fmt,
+ struct trace_event_call *call)
{
struct trace_event_fields *field = call->class->fields_array;
- const char *array_descriptor;
const char *p = fmt;
int len;
if (!(len = str_has_prefix(fmt, "REC->")))
- return false;
+ return NULL;
fmt += len;
for (p = fmt; *p; p++) {
if (!isalnum(*p) && *p != '_')
@@ -265,16 +265,141 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
len = p - fmt;
for (; field->type; field++) {
- if (strncmp(field->name, fmt, len) ||
- field->name[len])
+ if (strncmp(field->name, fmt, len) || field->name[len])
continue;
- array_descriptor = strchr(field->type, '[');
- /* This is an array and is OK to dereference. */
- return array_descriptor != NULL;
+
+ return field;
+ }
+ return NULL;
+}
+
+/*
+ * Check if the referenced field is an array and return true,
+ * as arrays are OK to dereference.
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+ struct trace_event_fields *field;
+
+ field = find_event_field(fmt, call);
+ if (!field)
+ return false;
+
+ /* This is an array and is OK to dereference. */
+ return strchr(field->type, '[') != NULL;
+}
+
+/* Look for a string within an argument */
+static bool find_print_string(const char *arg, const char *str, const char *end)
+{
+ const char *r;
+
+ r = strstr(arg, str);
+ return r && r < end;
+}
+
+/* Return true if the argument pointer is safe */
+static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
+{
+ const char *r, *e, *a;
+
+ e = fmt + len;
+
+ /* Find the REC-> in the argument */
+ r = strstr(fmt, "REC->");
+ if (r && r < e) {
+ /*
+ * Addresses of events on the buffer, or an array on the buffer is
+ * OK to dereference. There's ways to fool this, but
+ * this is to catch common mistakes, not malicious code.
+ */
+ a = strchr(fmt, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ return true;
+ } else if (find_print_string(fmt, "__get_dynamic_array(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_sockaddr(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) {
+ return true;
}
return false;
}
+/* Return true if the string is safe */
+static bool process_string(const char *fmt, int len, struct trace_event_call *call)
+{
+ struct trace_event_fields *field;
+ const char *r, *e, *s;
+
+ e = fmt + len;
+
+ /*
+ * There are several helper functions that return strings.
+ * If the argument contains a function, then assume its field is valid.
+ * It is considered that the argument has a function if it has:
+ * alphanumeric or '_' before a parenthesis.
+ */
+ s = fmt;
+ do {
+ r = strstr(s, "(");
+ if (!r || r >= e)
+ break;
+ for (int i = 1; r - i >= s; i++) {
+ char ch = *(r - i);
+ if (isspace(ch))
+ continue;
+ if (isalnum(ch) || ch == '_')
+ return true;
+ /* Anything else, this isn't a function */
+ break;
+ }
+ /* A function could be wrapped in parethesis, try the next one */
+ s = r + 1;
+ } while (s < e);
+
+ /*
+ * Check for arrays. If the argument has: foo[REC->val]
+ * then it is very likely that foo is an array of strings
+ * that are safe to use.
+ */
+ r = strstr(s, "[");
+ if (r && r < e) {
+ r = strstr(r, "REC->");
+ if (r && r < e)
+ return true;
+ }
+
+ /*
+ * If there's any strings in the argument consider this arg OK as it
+ * could be: REC->field ? "foo" : "bar" and we don't want to get into
+ * verifying that logic here.
+ */
+ if (find_print_string(fmt, "\"", e))
+ return true;
+
+ /* Dereferenced strings are also valid like any other pointer */
+ if (process_pointer(fmt, len, call))
+ return true;
+
+ /* Make sure the field is found */
+ field = find_event_field(fmt, call);
+ if (!field)
+ return false;
+
+ /* Test this field's string before printing the event */
+ call->flags |= TRACE_EVENT_FL_TEST_STR;
+ field->needs_test = 1;
+
+ return true;
+}
+
/*
* Examine the print fmt of the event looking for unsafe dereference
* pointers using %p* that could be recorded in the trace event and
@@ -284,13 +409,14 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
static void test_event_printk(struct trace_event_call *call)
{
u64 dereference_flags = 0;
+ u64 string_flags = 0;
bool first = true;
- const char *fmt, *c, *r, *a;
+ const char *fmt;
int parens = 0;
char in_quote = 0;
int start_arg = 0;
int arg = 0;
- int i;
+ int i, e;
fmt = call->print_fmt;
@@ -374,8 +500,16 @@ static void test_event_printk(struct trace_event_call *call)
star = true;
continue;
}
- if ((fmt[i + j] == 's') && star)
- arg++;
+ if ((fmt[i + j] == 's')) {
+ if (star)
+ arg++;
+ if (WARN_ONCE(arg == 63,
+ "Too many args for event: %s",
+ trace_event_name(call)))
+ return;
+ dereference_flags |= 1ULL << arg;
+ string_flags |= 1ULL << arg;
+ }
break;
}
break;
@@ -403,42 +537,47 @@ static void test_event_printk(struct trace_event_call *call)
case ',':
if (in_quote || parens)
continue;
+ e = i;
i++;
while (isspace(fmt[i]))
i++;
- start_arg = i;
- if (!(dereference_flags & (1ULL << arg)))
- goto next_arg;
- /* Find the REC-> in the argument */
- c = strchr(fmt + i, ',');
- r = strstr(fmt + i, "REC->");
- if (r && (!c || r < c)) {
- /*
- * Addresses of events on the buffer,
- * or an array on the buffer is
- * OK to dereference.
- * There's ways to fool this, but
- * this is to catch common mistakes,
- * not malicious code.
- */
- a = strchr(fmt + i, '&');
- if ((a && (a < r)) || test_field(r, call))
+ /*
+ * If start_arg is zero, then this is the start of the
+ * first argument. The processing of the argument happens
+ * when the end of the argument is found, as it needs to
+ * handle paranthesis and such.
+ */
+ if (!start_arg) {
+ start_arg = i;
+ /* Balance out the i++ in the for loop */
+ i--;
+ continue;
+ }
+
+ if (dereference_flags & (1ULL << arg)) {
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(fmt + start_arg, e - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(fmt + start_arg, e - start_arg, call))
dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
}
- next_arg:
- i--;
+ start_arg = i;
arg++;
+ /* Balance out the i++ in the for loop */
+ i--;
}
}
+ if (dereference_flags & (1ULL << arg)) {
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(fmt + start_arg, i - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(fmt + start_arg, i - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+
/*
* If you triggered the below warning, the trace event reported
* uses an unsafe dereference pointer %p*. As the data stored
@@ -730,6 +869,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file,
return __ftrace_event_enable_disable(file, enable, 0);
}
+#ifdef CONFIG_MODULES
+struct event_mod_load {
+ struct list_head list;
+ char *module;
+ char *match;
+ char *system;
+ char *event;
+};
+
+static void free_event_mod(struct event_mod_load *event_mod)
+{
+ list_del(&event_mod->list);
+ kfree(event_mod->module);
+ kfree(event_mod->match);
+ kfree(event_mod->system);
+ kfree(event_mod->event);
+ kfree(event_mod);
+}
+
+static void clear_mod_events(struct trace_array *tr)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ free_event_mod(event_mod);
+ }
+}
+
+static int remove_cache_mod(struct trace_array *tr, const char *mod,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod, *n;
+ int ret = -EINVAL;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod) != 0)
+ continue;
+
+ if (match && strcmp(event_mod->match, match) != 0)
+ continue;
+
+ if (system &&
+ (!event_mod->system || strcmp(event_mod->system, system) != 0))
+ continue;
+
+ if (event &&
+ (!event_mod->event || strcmp(event_mod->event, event) != 0))
+ continue;
+
+ free_event_mod(event_mod);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod;
+
+ /* If the module exists, then this just failed to find an event */
+ if (module_exists(mod))
+ return -EINVAL;
+
+ /* See if this is to remove a cached filter */
+ if (!set)
+ return remove_cache_mod(tr, mod, match, system, event);
+
+ event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL);
+ if (!event_mod)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&event_mod->list);
+ event_mod->module = kstrdup(mod, GFP_KERNEL);
+ if (!event_mod->module)
+ goto out_free;
+
+ if (match) {
+ event_mod->match = kstrdup(match, GFP_KERNEL);
+ if (!event_mod->match)
+ goto out_free;
+ }
+
+ if (system) {
+ event_mod->system = kstrdup(system, GFP_KERNEL);
+ if (!event_mod->system)
+ goto out_free;
+ }
+
+ if (event) {
+ event_mod->event = kstrdup(event, GFP_KERNEL);
+ if (!event_mod->event)
+ goto out_free;
+ }
+
+ list_add(&event_mod->list, &tr->mod_events);
+
+ return 0;
+
+ out_free:
+ free_event_mod(event_mod);
+
+ return -ENOMEM;
+}
+#else /* CONFIG_MODULES */
+static inline void clear_mod_events(struct trace_array *tr) { }
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ return -EINVAL;
+}
+#endif
+
static void ftrace_clear_events(struct trace_array *tr)
{
struct trace_event_file *file;
@@ -738,6 +991,7 @@ static void ftrace_clear_events(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ftrace_event_enable_disable(file, 0);
}
+ clear_mod_events(tr);
mutex_unlock(&event_mutex);
}
@@ -992,18 +1246,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
void event_file_get(struct trace_event_file *file)
{
- atomic_inc(&file->ref);
+ refcount_inc(&file->ref);
}
void event_file_put(struct trace_event_file *file)
{
- if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
if (file->flags & EVENT_FILE_FL_FREED)
kmem_cache_free(file_cachep, file);
return;
}
- if (atomic_dec_and_test(&file->ref)) {
+ if (refcount_dec_and_test(&file->ref)) {
/* Count should only go to zero when it is freed */
if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
return;
@@ -1026,17 +1280,36 @@ static void remove_event_file_dir(struct trace_event_file *file)
*/
static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
struct trace_event_file *file;
struct trace_event_call *call;
+ char *module __free(kfree) = NULL;
const char *name;
int ret = -EINVAL;
int eret = 0;
+ if (mod) {
+ char *p;
+
+ module = kstrdup(mod, GFP_KERNEL);
+ if (!module)
+ return -ENOMEM;
+
+ /* Replace all '-' with '_' as that's what modules do */
+ for (p = strchr(module, '-'); p; p = strchr(p + 1, '-'))
+ *p = '_';
+ }
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+
+ /* If a module is specified, skip events that are not that module */
+ if (module && (!call->module || strcmp(module_name(call->module), module)))
+ continue;
+
name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
@@ -1069,16 +1342,24 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
ret = eret;
}
+ /*
+ * If this is a module setting and nothing was found,
+ * check if the module was loaded. If it wasn't cache it.
+ */
+ if (module && ret == -EINVAL && !eret)
+ ret = cache_mod(tr, module, set, match, sub, event);
+
return ret;
}
static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
int ret;
mutex_lock(&event_mutex);
- ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
return ret;
@@ -1086,11 +1367,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
- char *event = NULL, *sub = NULL, *match;
+ char *event = NULL, *sub = NULL, *match, *mod;
int ret;
if (!tr)
return -ENOENT;
+
+ /* Modules events can be appened with :mod:<module> */
+ mod = strstr(buf, ":mod:");
+ if (mod) {
+ *mod = '\0';
+ /* move to the module name */
+ mod += 5;
+ }
+
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -1113,9 +1403,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
sub = NULL;
if (!strlen(event) || strcmp(event, "*") == 0)
event = NULL;
+ } else if (mod) {
+ /* Allow wildcard for no length or star */
+ if (!strlen(match) || strcmp(match, "*") == 0)
+ match = NULL;
}
- ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod);
/* Put back the colon to allow this to be called again */
if (buf)
@@ -1143,7 +1437,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
if (!tr)
return -ENODEV;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -1169,7 +1463,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system,
return -ENOENT;
set = (enable == true) ? 1 : 0;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
@@ -1256,37 +1550,78 @@ static void *t_start(struct seq_file *m, loff_t *pos)
return file;
}
+enum set_event_iter_type {
+ SET_EVENT_FILE,
+ SET_EVENT_MOD,
+};
+
+struct set_event_iter {
+ enum set_event_iter_type type;
+ union {
+ struct trace_event_file *file;
+ struct event_mod_load *event_mod;
+ };
+};
+
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_file *file = v;
+ struct set_event_iter *iter = v;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(file, &tr->events, list) {
- if (file->flags & EVENT_FILE_FL_ENABLED)
- return file;
+ if (iter->type == SET_EVENT_FILE) {
+ file = iter->file;
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & EVENT_FILE_FL_ENABLED) {
+ iter->file = file;
+ return iter;
+ }
+ }
+#ifdef CONFIG_MODULES
+ iter->type = SET_EVENT_MOD;
+ iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list);
+#endif
}
+#ifdef CONFIG_MODULES
+ list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list)
+ return iter;
+#endif
+
+ /*
+ * The iter is allocated in s_start() and passed via the 'v'
+ * parameter. To stop the iterator, NULL must be returned. But
+ * the return value is what the 'v' parameter in s_stop() receives
+ * and frees. Free iter here as it will no longer be used.
+ */
+ kfree(iter);
return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct trace_event_file *file;
struct trace_array *tr = m->private;
+ struct set_event_iter *iter;
loff_t l;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct trace_event_file, list);
+ iter->type = SET_EVENT_FILE;
+ iter->file = list_entry(&tr->events, struct trace_event_file, list);
+
for (l = 0; l <= *pos; ) {
- file = s_next(m, file, &l);
- if (!file)
+ iter = s_next(m, iter, &l);
+ if (!iter)
break;
}
- return file;
+ return iter;
}
static int t_show(struct seq_file *m, void *v)
@@ -1306,6 +1641,45 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+#ifdef CONFIG_MODULES
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+ const char *system;
+ const char *event;
+
+ if (iter->type == SET_EVENT_FILE)
+ return t_show(m, iter->file);
+
+ /* When match is set, system and event are not */
+ if (iter->event_mod->match) {
+ seq_printf(m, "%s:mod:%s\n", iter->event_mod->match,
+ iter->event_mod->module);
+ return 0;
+ }
+
+ system = iter->event_mod->system ? : "*";
+ event = iter->event_mod->event ? : "*";
+
+ seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module);
+
+ return 0;
+}
+#else /* CONFIG_MODULES */
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+
+ return t_show(m, iter->file);
+}
+#endif
+
+static void s_stop(struct seq_file *m, void *v)
+{
+ kfree(v);
+ t_stop(m, NULL);
+}
+
static void *
__next(struct seq_file *m, void *v, loff_t *pos, int type)
{
@@ -1386,12 +1760,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[4] = "0";
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (likely(file))
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file || flags & EVENT_FILE_FL_FREED)
+ if (!file)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1419,21 +1793,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
switch (val) {
case 0:
case 1:
- ret = -ENODEV;
- mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
- ret = tracing_update_buffers(file->tr);
- if (ret < 0) {
- mutex_unlock(&event_mutex);
- return ret;
- }
- ret = ftrace_event_enable_disable(file, val);
- }
- mutex_unlock(&event_mutex);
+ file = event_file_file(filp);
+ if (!file)
+ return -ENODEV;
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0)
+ return ret;
+ ret = ftrace_event_enable_disable(file, val);
+ if (ret < 0)
+ return ret;
break;
default:
@@ -1442,7 +1815,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
- return ret ? ret : cnt;
+ return cnt;
}
static ssize_t
@@ -1520,7 +1893,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (system)
name = system->name;
- ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL);
if (ret)
goto out;
@@ -1540,7 +1913,8 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -1572,7 +1946,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -1627,12 +2002,14 @@ static int f_show(struct seq_file *m, void *v)
static void *f_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_event_file *file;
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- if (!event_file_data(m->private))
+ file = event_file_file(m->private);
+ if (!file)
return ERR_PTR(-ENODEV);
while (l < *pos && p)
@@ -1670,6 +2047,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
@@ -1684,6 +2062,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
+#endif
static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
@@ -1704,8 +2083,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file && !(file->flags & EVENT_FILE_FL_FREED))
+ file = event_file_file(filp);
+ if (file)
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1734,9 +2113,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return PTR_ERR(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- err = apply_event_filter(file, buf);
+ file = event_file_file(filp);
+ if (file) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ err = -ENODEV;
+ else
+ err = apply_event_filter(file, buf);
+ }
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2008,7 +2391,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
if (type == TRACE_PIDS) {
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
@@ -2024,7 +2407,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
if (type == TRACE_PIDS)
rcu_assign_pointer(tr->filtered_pids, pid_list);
@@ -2049,11 +2432,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
*/
on_each_cpu(ignore_task_cpu, tr, 1);
- out:
- mutex_unlock(&event_mutex);
-
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -2088,8 +2467,8 @@ static const struct seq_operations show_event_seq_ops = {
static const struct seq_operations show_set_event_seq_ops = {
.start = s_start,
.next = s_next,
- .show = t_show,
- .stop = t_stop,
+ .show = s_show,
+ .stop = s_stop,
};
static const struct seq_operations show_set_pid_seq_ops = {
@@ -2152,10 +2531,12 @@ static const struct file_operations ftrace_event_format_fops = {
.release = seq_release,
};
+#ifdef CONFIG_PERF_EVENTS
static const struct file_operations ftrace_event_id_fops = {
.read = event_id_read,
.llseek = default_llseek,
};
+#endif
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_file_tr,
@@ -2459,7 +2840,7 @@ event_define_fields(struct trace_event_call *call)
ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
field->is_signed, field->filter_type,
- field->len);
+ field->len, field->needs_test);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
@@ -2481,7 +2862,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
if (strcmp(name, "format") == 0) {
*mode = TRACE_MODE_READ;
*fops = &ftrace_event_format_fops;
- *data = call;
return 1;
}
@@ -2548,6 +2928,14 @@ static int event_callback(const char *name, umode_t *mode, void **data,
return 0;
}
+/* The file is incremented on creation and freeing the enable file decrements it */
+static void event_release(const char *name, void *data)
+{
+ struct trace_event_file *file = data;
+
+ event_file_put(file);
+}
+
static int
event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
@@ -2562,6 +2950,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
.name = "enable",
.callback = event_callback,
+ .release = event_release,
},
{
.name = "filter",
@@ -2630,6 +3019,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
return ret;
}
+ /* Gets decremented on freeing of the "enable" file */
+ event_file_get(file);
+
return 0;
}
@@ -2949,6 +3341,20 @@ static bool event_in_systems(struct trace_event_call *call,
return !*p || isspace(*p) || *p == ',';
}
+#ifdef CONFIG_HIST_TRIGGERS
+/*
+ * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger
+ * may happen in any context.
+ */
+static void hist_poll_event_irq_work(struct irq_work *work)
+{
+ wake_up_all(&hist_poll_wq);
+}
+
+DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work);
+DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq);
+#endif
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -2980,7 +3386,7 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
- event_file_get(file);
+ refcount_set(&file->ref, 1);
return file;
}
@@ -3107,13 +3513,13 @@ int trace_add_event_call(struct trace_event_call *call)
int ret;
lockdep_assert_held(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = __register_event(call, NULL);
- if (ret >= 0)
- __add_event_to_tracers(call);
+ if (ret < 0)
+ return ret;
- mutex_unlock(&trace_types_lock);
+ __add_event_to_tracers(call);
return ret;
}
EXPORT_SYMBOL_GPL(trace_add_event_call);
@@ -3126,8 +3532,6 @@ static void __trace_remove_event_call(struct trace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- free_event_filter(call->filter);
- call->filter = NULL;
}
static int probe_remove_event_call(struct trace_event_call *call)
@@ -3195,6 +3599,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call);
event++)
#ifdef CONFIG_MODULES
+static void update_mod_cache(struct trace_array *tr, struct module *mod)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod->name) != 0)
+ continue;
+
+ __ftrace_set_clr_event_nolock(tr, event_mod->match,
+ event_mod->system,
+ event_mod->event, 1, mod->name);
+ free_event_mod(event_mod);
+ }
+}
+
+static void update_cache_events(struct module *mod)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ update_mod_cache(tr, mod);
+}
static void trace_module_add_events(struct module *mod)
{
@@ -3217,6 +3643,8 @@ static void trace_module_add_events(struct module *mod)
__register_event(*call, mod);
__add_event_to_tracers(*call);
}
+
+ update_cache_events(mod);
}
static void trace_module_remove_events(struct module *mod)
@@ -3369,30 +3797,21 @@ struct trace_event_file *trace_get_event_file(const char *instance,
return ERR_PTR(ret);
}
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
file = find_event_file(tr, system, event);
if (!file) {
trace_array_put(tr);
- ret = -EINVAL;
- goto out;
+ return ERR_PTR(-EINVAL);
}
/* Don't let event modules unload while in use */
ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
- ret = -EBUSY;
- goto out;
+ return ERR_PTR(-EBUSY);
}
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
-
- if (ret)
- file = ERR_PTR(ret);
-
return file;
}
EXPORT_SYMBOL_GPL(trace_get_event_file);
@@ -3610,6 +4029,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
+ unsigned long count = -1;
const char *system;
const char *event;
char *number;
@@ -3629,12 +4049,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
event = strsep(&param, ":");
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- ret = -EINVAL;
file = find_event_file(tr, system, event);
if (!file)
- goto out;
+ return -EINVAL;
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
@@ -3643,74 +4062,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
else
ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- goto out;
- }
+ if (glob[0] == '!')
+ return unregister_ftrace_function_probe_func(glob+1, tr, ops);
- ret = -ENOMEM;
+ if (param) {
+ number = strsep(&param, ":");
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- data->enable = enable;
- data->count = -1;
- data->file = file;
-
- if (!param)
- goto out_reg;
-
- number = strsep(&param, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
+ if (!strlen(number))
+ return -EINVAL;
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &data->count);
- if (ret)
- goto out_free;
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &count);
+ if (ret)
+ return ret;
+ }
- out_reg:
/* Don't let event modules unload while probe registered */
ret = trace_event_try_get_ref(file->event_call);
- if (!ret) {
- ret = -EBUSY;
- goto out_free;
- }
+ if (!ret)
+ return -EBUSY;
ret = __ftrace_event_enable_disable(file, 1, 1);
if (ret < 0)
goto out_put;
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out_put;
+
+ data->enable = enable;
+ data->count = count;
+ data->file = file;
+
ret = register_ftrace_function_probe(glob, tr, ops, data);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
* Consider no functions a failure too.
*/
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
- goto out_disable;
+
/* Just return zero, not the number of enabled functions */
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
- return ret;
+ if (ret > 0)
+ return 0;
+
+ kfree(data);
+
+ if (!ret)
+ ret = -ENOENT;
- out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
trace_event_put_ref(file->event_call);
- out_free:
- kfree(data);
- goto out;
+ return ret;
}
static struct ftrace_func_command event_enable_cmd = {
@@ -3933,20 +4340,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ return ret;
down_write(&trace_event_sem);
__trace_early_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
/* Must be called with event_mutex held */
@@ -3961,7 +4365,7 @@ int event_trace_del_tracer(struct trace_array *tr)
__ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
/* Disable any running events */
- __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL);
/* Make sure no more events are being executed */
tracepoint_synchronize_unregister();
@@ -4245,7 +4649,7 @@ static __init void event_trace_self_tests(void)
pr_info("Testing event system %s: ", system->name);
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling system %s\n",
system->name);
@@ -4254,7 +4658,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling system %s\n",
system->name);
@@ -4269,7 +4673,7 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on all trace events:\n");
pr_info("Testing all events: ");
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling all events\n");
return;
@@ -4278,7 +4682,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
/* reset sysname */
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling all events\n");
return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c611b281a5b..0993dfc1c5c1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- strncpy(num_buf, str + s, len);
+ memcpy(num_buf, str + s, len);
num_buf[len] = 0;
ret = kstrtoul(num_buf, 0, &ip);
@@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data,
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
- strncpy(pred->regex->pattern, str + s, len);
+ memcpy(pred->regex->pattern, str + s, len);
pred->regex->pattern[len] = 0;
} else if (!strncmp(str + i, "CPUS", 4)) {
@@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data,
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
- strncpy(pred->regex->pattern, str + s, len);
+ memcpy(pred->regex->pattern, str + s, len);
pred->regex->pattern[len] = 0;
filter_build_regex(pred);
@@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- strncpy(num_buf, str + s, len);
+ memcpy(num_buf, str + s, len);
num_buf[len] = 0;
/* Make sure it is a value */
@@ -2405,13 +2405,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
struct event_filter *filter = NULL;
int err = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
/* Make sure the system still has events */
- if (!dir->nr_events) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!dir->nr_events)
+ return -ENODEV;
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(dir, tr);
@@ -2422,7 +2420,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
- goto out_unlock;
+ return 0;
}
err = create_system_filter(dir, filter_string, &filter);
@@ -2434,8 +2432,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
__free_filter(system->filter);
system->filter = filter;
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
@@ -2612,17 +2608,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
struct event_filter *filter = NULL;
struct trace_event_call *call;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
call = event->tp_event;
- err = -EINVAL;
if (!call)
- goto out_unlock;
+ return -EINVAL;
- err = -EEXIST;
if (event->filter)
- goto out_unlock;
+ return -EEXIST;
err = create_filter(NULL, call, filter_str, false, &filter);
if (err)
@@ -2637,9 +2631,6 @@ free_filter:
if (err || ftrace_event_is_function(call))
__free_filter(filter);
-out_unlock:
- mutex_unlock(&event_mutex);
-
return err;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..53dc6719181e 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
{
struct tracepoint *tp = event->tp;
- if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+ if (unlikely(static_key_enabled(&tp->key))) {
struct tracepoint_func *probe_func_ptr;
synth_probe_func_t probe_func;
void *__data;
@@ -1354,10 +1354,7 @@ static const char *hist_field_name(struct hist_field *field,
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
else if (field->flags & HIST_FIELD_FL_STACKTRACE) {
- if (field->field)
- field_name = field->field->name;
- else
- field_name = "common_stacktrace";
+ field_name = "common_stacktrace";
} else if (field->flags & HIST_FIELD_FL_HITCOUNT)
field_name = "hitcount";
@@ -1599,7 +1596,7 @@ static inline void save_comm(char *comm, struct task_struct *task)
return;
}
- strncpy(comm, task->comm, TASK_COMM_LEN);
+ strscpy(comm, task->comm, TASK_COMM_LEN);
}
static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -3405,7 +3402,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
elt_data = context->elt->private_data;
track_elt_data = track_data->elt.private_data;
if (elt_data->comm)
- strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+ strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
track_data->updated = true;
@@ -5314,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+
+ hist_poll_wakeup();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5593,49 +5592,137 @@ static void hist_trigger_show(struct seq_file *m,
n_entries, (u64)atomic64_read(&hist_data->map->drops));
}
+struct hist_file_data {
+ struct file *file;
+ u64 last_read;
+ u64 last_act;
+};
+
+static u64 get_hist_hit_count(struct trace_event_file *event_file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *data;
+ u64 ret = 0;
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = data->private_data;
+ ret += atomic64_read(&hist_data->map->hits);
+ }
+ }
+ return ret;
+}
+
static int hist_show(struct seq_file *m, void *v)
{
+ struct hist_file_data *hist_file = m->private;
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_data(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(hist_file->file);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
+ hist_file->last_read = get_hist_hit_count(event_file);
+ /*
+ * Update last_act too so that poll()/POLLPRI can wait for the next
+ * event after any syscall on hist file.
+ */
+ hist_file->last_act = hist_file->last_read;
- out_unlock:
- mutex_unlock(&event_mutex);
+ return 0;
+}
+
+static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct trace_event_file *event_file;
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+ __poll_t ret = 0;
+ u64 cnt;
+
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_data(file);
+ if (!event_file)
+ return EPOLLERR;
+
+ hist_poll_wait(file, wait);
+
+ cnt = get_hist_hit_count(event_file);
+ if (hist_file->last_read != cnt)
+ ret |= EPOLLIN | EPOLLRDNORM;
+ if (hist_file->last_act != cnt) {
+ hist_file->last_act = cnt;
+ ret |= EPOLLPRI;
+ }
return ret;
}
+static int event_hist_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+
+ kfree(hist_file);
+ return tracing_single_release_file_tr(inode, file);
+}
+
static int event_hist_open(struct inode *inode, struct file *file)
{
+ struct trace_event_file *event_file;
+ struct hist_file_data *hist_file;
int ret;
ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_data(file);
+ if (!event_file) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL);
+ if (!hist_file) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ hist_file->file = file;
+ hist_file->last_act = get_hist_hit_count(event_file);
+
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_show, file);
+ ret = single_open(file, hist_show, hist_file);
+ if (ret) {
+ kfree(hist_file);
+ goto err;
+ }
+
+ return 0;
+err:
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = tracing_single_release_file_tr,
+ .release = event_hist_release,
+ .poll = event_hist_poll,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5876,25 +5963,19 @@ static int hist_debug_show(struct seq_file *m, void *v)
{
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_data(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(m->private);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_debug_show(m, data, n++);
}
-
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
static int event_hist_debug_open(struct inode *inode, struct file *file)
@@ -5907,7 +5988,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_debug_show, file);
+ ret = single_open(file, hist_debug_show, file);
+ if (ret)
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_debug_fops = {
@@ -6652,27 +6736,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
if (existing_hist_update_only(glob, trigger_data, file))
goto out_free;
- ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
- if (ret < 0)
- goto out_free;
+ if (!get_named_trigger_data(trigger_data)) {
- if (get_named_trigger_data(trigger_data))
- goto enable;
+ ret = create_actions(hist_data);
+ if (ret)
+ goto out_free;
- ret = create_actions(hist_data);
- if (ret)
- goto out_unreg;
+ if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+ ret = save_hist_vars(hist_data);
+ if (ret)
+ goto out_free;
+ }
- if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
- ret = save_hist_vars(hist_data);
+ ret = tracing_map_init(hist_data->map);
if (ret)
- goto out_unreg;
+ goto out_free;
}
- ret = tracing_map_init(hist_data->map);
- if (ret)
- goto out_unreg;
-enable:
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret < 0)
+ goto out_free;
+
ret = hist_trigger_enable(trigger_data, file);
if (ret)
goto out_unreg;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bdaa9..a8f076809db4 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
strim(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (file) {
call = file->event_call;
size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index c82b401a294d..e3f7d09e5512 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -49,16 +49,11 @@ static char *last_cmd;
static int errpos(const char *str)
{
- int ret = 0;
-
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!str || !last_cmd)
- goto out;
+ return 0;
- ret = err_pos(last_cmd, str);
- out:
- mutex_unlock(&lastcmd_mutex);
- return ret;
+ return err_pos(last_cmd, str);
}
static void last_cmd_set(const char *str)
@@ -74,14 +69,12 @@ static void last_cmd_set(const char *str)
static void synth_err(u8 err_type, u16 err_pos)
{
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!last_cmd)
- goto out;
+ return;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
- out:
- mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b33c3861fbbb..d45448947094 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
@@ -211,12 +211,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- if (unlikely(!event_file_data(file))) {
- mutex_unlock(&event_mutex);
+ if (unlikely(!event_file_file(file)))
return -ENODEV;
- }
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
@@ -239,8 +237,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
}
}
- mutex_unlock(&event_mutex);
-
return ret;
}
@@ -248,7 +244,6 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next;
struct event_command *p;
- int ret = -EINVAL;
next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
@@ -259,17 +254,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
}
command = (command[0] != '!') ? command : command + 1;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->parse(p, file, buff, command, next);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->parse(p, file, buff, command, next);
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t event_trigger_regex_write(struct file *file,
@@ -278,7 +270,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
{
struct trace_event_file *event_file;
ssize_t ret;
- char *buf;
+ char *buf __free(kfree) = NULL;
if (!cnt)
return 0;
@@ -292,24 +284,18 @@ static ssize_t event_trigger_regex_write(struct file *file,
strim(buf);
- mutex_lock(&event_mutex);
- event_file = event_file_data(file);
- if (unlikely(!event_file)) {
- mutex_unlock(&event_mutex);
- kfree(buf);
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_file(file);
+ if (unlikely(!event_file))
return -ENODEV;
- }
- ret = trigger_process_regex(event_file, buf);
- mutex_unlock(&event_mutex);
- kfree(buf);
+ ret = trigger_process_regex(event_file, buf);
if (ret < 0)
- goto out;
+ return ret;
*ppos += cnt;
- ret = cnt;
- out:
- return ret;
+ return cnt;
}
static int event_trigger_regex_release(struct inode *inode, struct file *file)
@@ -359,20 +345,16 @@ const struct file_operations event_trigger_fops = {
__init int register_event_command(struct event_command *cmd)
{
struct event_command *p;
- int ret = 0;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &trigger_commands);
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -382,20 +364,17 @@ __init int register_event_command(struct event_command *cmd)
__init int unregister_event_command(struct event_command *cmd)
{
struct event_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry_safe(p, n, &trigger_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -ENODEV;
}
/**
@@ -597,20 +576,12 @@ out:
return ret;
}
-/**
- * unregister_trigger - Generic event_command @unreg implementation
- * @glob: The raw string used to register the trigger
- * @test: Trigger-specific data used to find the trigger to remove
- * @file: The trace_event_file associated with the event
- *
- * Common implementation for event trigger unregistration.
- *
- * Usually used directly as the @unreg method in event command
- * implementations.
+/*
+ * True if the trigger was found and unregistered, else false.
*/
-static void unregister_trigger(char *glob,
- struct event_trigger_data *test,
- struct trace_event_file *file)
+static bool try_unregister_trigger(char *glob,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
{
struct event_trigger_data *data = NULL, *iter;
@@ -626,8 +597,32 @@ static void unregister_trigger(char *glob,
}
}
- if (data && data->ops->free)
- data->ops->free(data);
+ if (data) {
+ if (data->ops->free)
+ data->ops->free(data);
+
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * unregister_trigger - Generic event_command @unreg implementation
+ * @glob: The raw string used to register the trigger
+ * @test: Trigger-specific data used to find the trigger to remove
+ * @file: The trace_event_file associated with the event
+ *
+ * Common implementation for event trigger unregistration.
+ *
+ * Usually used directly as the @unreg method in event command
+ * implementations.
+ */
+static void unregister_trigger(char *glob,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
+{
+ try_unregister_trigger(glob, test, file);
}
/*
@@ -1470,12 +1465,23 @@ register_snapshot_trigger(char *glob,
struct event_trigger_data *data,
struct trace_event_file *file)
{
- int ret = tracing_alloc_snapshot_instance(file->tr);
+ int ret = tracing_arm_snapshot(file->tr);
if (ret < 0)
return ret;
- return register_trigger(glob, data, file);
+ ret = register_trigger(glob, data, file);
+ if (ret < 0)
+ tracing_disarm_snapshot(file->tr);
+ return ret;
+}
+
+static void unregister_snapshot_trigger(char *glob,
+ struct event_trigger_data *data,
+ struct trace_event_file *file)
+{
+ if (try_unregister_trigger(glob, data, file))
+ tracing_disarm_snapshot(file->tr);
}
static int
@@ -1510,7 +1516,7 @@ static struct event_command trigger_snapshot_cmd = {
.trigger_type = ETT_SNAPSHOT,
.parse = event_trigger_parse,
.reg = register_snapshot_trigger,
- .unreg = unregister_trigger,
+ .unreg = unregister_snapshot_trigger,
.get_trigger_ops = snapshot_get_trigger_ops,
.set_filter = set_trigger_filter,
};
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index e76f5e1efdf2..97325fbd6283 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -34,7 +34,8 @@
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
-#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define EVENT_NAME(user_event) ((user_event)->reg_name)
+#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
/*
@@ -54,10 +55,13 @@
* allows isolation for events by various means.
*/
struct user_event_group {
- char *system_name;
- struct hlist_node node;
- struct mutex reg_mutex;
+ char *system_name;
+ char *system_multi_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
+ /* ID that moves forward within the group for multi-event names */
+ u64 multi_id;
};
/* Group for init_user_ns mapping, top-most group */
@@ -78,6 +82,7 @@ static unsigned int current_user_events;
*/
struct user_event {
struct user_event_group *group;
+ char *reg_name;
struct tracepoint tracepoint;
struct trace_event_call call;
struct trace_event_class class;
@@ -127,6 +132,8 @@ struct user_event_enabler {
#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK))
+#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT)
+
/* Used for asynchronous faulting in of pages */
struct user_event_enabler_fault {
struct work_struct work;
@@ -202,6 +209,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
static void user_event_mm_put(struct user_event_mm *mm);
static int destroy_user_event(struct user_event *user);
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv);
static u32 user_event_key(char *name)
{
@@ -328,6 +337,7 @@ out:
static void user_event_group_destroy(struct user_event_group *group)
{
kfree(group->system_name);
+ kfree(group->system_multi_name);
kfree(group);
}
@@ -346,6 +356,11 @@ static char *user_event_group_system_name(void)
return system_name;
}
+static char *user_event_group_system_multi_name(void)
+{
+ return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL);
+}
+
static struct user_event_group *current_user_event_group(void)
{
return init_group;
@@ -365,6 +380,11 @@ static struct user_event_group *user_event_group_create(void)
if (!group->system_name)
goto error;
+ group->system_multi_name = user_event_group_system_multi_name();
+
+ if (!group->system_multi_name)
+ goto error;
+
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
@@ -1480,6 +1500,11 @@ static int destroy_user_event(struct user_event *user)
hash_del(&user->node);
user_event_destroy_validators(user);
+
+ /* If we have different names, both must be freed */
+ if (EVENT_NAME(user) != EVENT_TP_NAME(user))
+ kfree(EVENT_TP_NAME(user));
+
kfree(user->call.print_fmt);
kfree(EVENT_NAME(user));
kfree(user);
@@ -1493,17 +1518,36 @@ static int destroy_user_event(struct user_event *user)
}
static struct user_event *find_user_event(struct user_event_group *group,
- char *name, u32 *outkey)
+ char *name, int argc, const char **argv,
+ u32 flags, u32 *outkey)
{
struct user_event *user;
u32 key = user_event_key(name);
*outkey = key;
- hash_for_each_possible(group->register_table, user, node, key)
- if (!strcmp(EVENT_NAME(user), name))
+ hash_for_each_possible(group->register_table, user, node, key) {
+ /*
+ * Single-format events shouldn't return multi-format
+ * events. Callers expect the underlying tracepoint to match
+ * the name exactly in these cases. Only check like-formats.
+ */
+ if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags))
+ continue;
+
+ if (strcmp(EVENT_NAME(user), name))
+ continue;
+
+ if (user_fields_match(user, argc, argv))
return user_event_get(user);
+ /* Scan others if this is a multi-format event */
+ if (EVENT_MULTI_FORMAT(flags))
+ continue;
+
+ return ERR_PTR(-EADDRINUSE);
+ }
+
return NULL;
}
@@ -1632,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user)
struct tracepoint *tp = &user->tracepoint;
char status = 0;
- if (atomic_read(&tp->key.enabled) > 0) {
+ if (static_key_enabled(&tp->key)) {
struct tracepoint_func *probe_func_ptr;
user_event_func_t probe_func;
@@ -1860,6 +1904,9 @@ static bool user_fields_match(struct user_event *user, int argc,
struct list_head *head = &user->fields;
int i = 0;
+ if (argc == 0)
+ return list_empty(head);
+
list_for_each_entry_reverse(field, head, link) {
if (!user_field_match(field, argc, argv, &i))
return false;
@@ -1877,13 +1924,15 @@ static bool user_event_match(const char *system, const char *event,
struct user_event *user = container_of(ev, struct user_event, devent);
bool match;
- match = strcmp(EVENT_NAME(user), event) == 0 &&
- (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+ match = strcmp(EVENT_NAME(user), event) == 0;
- if (match && argc > 0)
+ if (match && system) {
+ match = strcmp(system, user->group->system_name) == 0 ||
+ strcmp(system, user->group->system_multi_name) == 0;
+ }
+
+ if (match)
match = user_fields_match(user, argc, argv);
- else if (match && argc == 0)
- match = list_empty(&user->fields);
return match;
}
@@ -1913,6 +1962,107 @@ static int user_event_trace_register(struct user_event *user)
return ret;
}
+static int user_event_set_tp_name(struct user_event *user)
+{
+ lockdep_assert_held(&user->group->reg_mutex);
+
+ if (EVENT_MULTI_FORMAT(user->reg_flags)) {
+ char *multi_name;
+
+ multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx",
+ user->reg_name, user->group->multi_id);
+
+ if (!multi_name)
+ return -ENOMEM;
+
+ user->call.name = multi_name;
+ user->tracepoint.name = multi_name;
+
+ /* Inc to ensure unique multi-event name next time */
+ user->group->multi_id++;
+ } else {
+ /* Non Multi-format uses register name */
+ user->call.name = user->reg_name;
+ user->tracepoint.name = user->reg_name;
+ }
+
+ return 0;
+}
+
+/*
+ * Counts how many ';' without a trailing space are in the args.
+ */
+static int count_semis_no_space(char *args)
+{
+ int count = 0;
+
+ while ((args = strchr(args, ';'))) {
+ args++;
+
+ if (!isspace(*args))
+ count++;
+ }
+
+ return count;
+}
+
+/*
+ * Copies the arguments while ensuring all ';' have a trailing space.
+ */
+static char *insert_space_after_semis(char *args, int count)
+{
+ char *fixed, *pos;
+ int len;
+
+ len = strlen(args) + count;
+ fixed = kmalloc(len + 1, GFP_KERNEL);
+
+ if (!fixed)
+ return NULL;
+
+ pos = fixed;
+
+ /* Insert a space after ';' if there is no trailing space. */
+ while (*args) {
+ *pos = *args++;
+
+ if (*pos++ == ';' && !isspace(*args))
+ *pos++ = ' ';
+ }
+
+ *pos = '\0';
+
+ return fixed;
+}
+
+static char **user_event_argv_split(char *args, int *argc)
+{
+ char **split;
+ char *fixed;
+ int count;
+
+ /* Count how many ';' without a trailing space */
+ count = count_semis_no_space(args);
+
+ /* No fixup is required */
+ if (!count)
+ return argv_split(GFP_KERNEL, args, argc);
+
+ /* We must fixup 'field;field' to 'field; field' */
+ fixed = insert_space_after_semis(args, count);
+
+ if (!fixed)
+ return NULL;
+
+ /* We do a normal split afterwards */
+ split = argv_split(GFP_KERNEL, fixed, argc);
+
+ /* We can free since argv_split makes a copy */
+ kfree(fixed);
+
+ return split;
+}
+
/*
* Parses the event name, arguments and flags then registers if successful.
* The name buffer lifetime is owned by this method for success cases only.
@@ -1922,11 +2072,11 @@ static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser, int reg_flags)
{
- int ret;
- u32 key;
struct user_event *user;
+ char **argv = NULL;
int argc = 0;
- char **argv;
+ int ret;
+ u32 key;
/* Currently don't support any text based flags */
if (flags != NULL)
@@ -1935,41 +2085,34 @@ static int user_event_parse(struct user_event_group *group, char *name,
if (!user_event_capable(reg_flags))
return -EPERM;
+ if (args) {
+ argv = user_event_argv_split(args, &argc);
+
+ if (!argv)
+ return -ENOMEM;
+ }
+
/* Prevent dyn_event from racing */
mutex_lock(&event_mutex);
- user = find_user_event(group, name, &key);
+ user = find_user_event(group, name, argc, (const char **)argv,
+ reg_flags, &key);
mutex_unlock(&event_mutex);
- if (user) {
- if (args) {
- argv = argv_split(GFP_KERNEL, args, &argc);
- if (!argv) {
- ret = -ENOMEM;
- goto error;
- }
+ if (argv)
+ argv_free(argv);
- ret = user_fields_match(user, argc, (const char **)argv);
- argv_free(argv);
-
- } else
- ret = list_empty(&user->fields);
-
- if (ret) {
- *newuser = user;
- /*
- * Name is allocated by caller, free it since it already exists.
- * Caller only worries about failure cases for freeing.
- */
- kfree(name);
- } else {
- ret = -EADDRINUSE;
- goto error;
- }
+ if (IS_ERR(user))
+ return PTR_ERR(user);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
return 0;
-error:
- user_event_put(user, false);
- return ret;
}
user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
@@ -1982,7 +2125,13 @@ error:
INIT_LIST_HEAD(&user->validators);
user->group = group;
- user->tracepoint.name = name;
+ user->reg_name = name;
+ user->reg_flags = reg_flags;
+
+ ret = user_event_set_tp_name(user);
+
+ if (ret)
+ goto put_user;
ret = user_event_parse_fields(user, args);
@@ -1996,11 +2145,14 @@ error:
user->call.data = user;
user->call.class = &user->class;
- user->call.name = name;
user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
user->call.tp = &user->tracepoint;
user->call.event.funcs = &user_event_funcs;
- user->class.system = group->system_name;
+
+ if (EVENT_MULTI_FORMAT(user->reg_flags))
+ user->class.system = group->system_multi_name;
+ else
+ user->class.system = group->system_name;
user->class.fields_array = user_event_fields_array;
user->class.get_fields = user_event_get_fields;
@@ -2022,8 +2174,6 @@ error:
if (ret)
goto put_user_lock;
- user->reg_flags = reg_flags;
-
if (user->reg_flags & USER_EVENT_REG_PERSIST) {
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
@@ -2047,30 +2197,43 @@ put_user:
user_event_destroy_fields(user);
user_event_destroy_validators(user);
kfree(user->call.print_fmt);
+
+ /* Caller frees reg_name on error, but not multi-name */
+ if (EVENT_NAME(user) != EVENT_TP_NAME(user))
+ kfree(EVENT_TP_NAME(user));
+
kfree(user);
return ret;
}
/*
- * Deletes a previously created event if it is no longer being used.
+ * Deletes previously created events if they are no longer being used.
*/
static int delete_user_event(struct user_event_group *group, char *name)
{
- u32 key;
- struct user_event *user = find_user_event(group, name, &key);
+ struct user_event *user;
+ struct hlist_node *tmp;
+ u32 key = user_event_key(name);
+ int ret = -ENOENT;
- if (!user)
- return -ENOENT;
+ /* Attempt to delete all event(s) with the name passed in */
+ hash_for_each_possible_safe(group->register_table, user, tmp, node, key) {
+ if (strcmp(EVENT_NAME(user), name))
+ continue;
- user_event_put(user, true);
+ if (!user_event_last_ref(user))
+ return -EBUSY;
- if (!user_event_last_ref(user))
- return -EBUSY;
+ if (!user_event_capable(user->reg_flags))
+ return -EPERM;
- if (!user_event_capable(user->reg_flags))
- return -EPERM;
+ ret = destroy_user_event(user);
- return destroy_user_event(user);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
}
/*
@@ -2117,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
* It's possible key.enabled disables after this check, however
* we don't mind if a few events are included in this condition.
*/
- if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ if (likely(static_key_enabled(&tp->key))) {
struct tracepoint_func *probe_func_ptr;
user_event_func_t probe_func;
struct iov_iter copy;
@@ -2628,7 +2791,7 @@ static int user_seq_show(struct seq_file *m, void *p)
hash_for_each(group->register_table, i, user, node) {
status = user->status;
- seq_printf(m, "%s", EVENT_NAME(user));
+ seq_printf(m, "%s", EVENT_TP_NAME(user));
if (status != 0)
seq_puts(m, " #");
@@ -2722,7 +2885,7 @@ err:
return -ENODEV;
}
-static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+static int set_max_user_events_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2736,7 +2899,7 @@ static int set_max_user_events_sysctl(struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table user_event_sysctls[] = {
+static const struct ctl_table user_event_sysctls[] = {
{
.procname = "user_events_max",
.data = &max_user_events,
@@ -2744,7 +2907,6 @@ static struct ctl_table user_event_sysctls[] = {
.mode = 0644,
.proc_handler = set_max_user_events_sysctl,
},
- {}
};
static int __init trace_events_user_init(void)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 7d2ddbcfa377..985ff98272da 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -4,6 +4,7 @@
* Copyright (C) 2022 Google LLC.
*/
#define pr_fmt(fmt) "trace_fprobe: " fmt
+#include <asm/ptrace.h>
#include <linux/fprobe.h>
#include <linux/module.h>
@@ -20,6 +21,7 @@
#define FPROBE_EVENT_SYSTEM "fprobes"
#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
#define RETHOOK_MAXACTIVE_MAX 4096
+#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
static int trace_fprobe_create(const char *raw_command);
static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -129,10 +131,10 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
* from user space.
*/
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
- struct pt_regs *regs = rec;
+ struct ftrace_regs *fregs = rec;
unsigned long val;
int ret;
@@ -140,17 +142,20 @@ retry:
/* 1st stage: get value from context */
switch (code->op) {
case FETCH_OP_STACK:
- val = regs_get_kernel_stack_nth(regs, code->param);
+ val = ftrace_regs_get_kernel_stack_nth(fregs, code->param);
break;
case FETCH_OP_STACKP:
- val = kernel_stack_pointer(regs);
+ val = ftrace_regs_get_stack_pointer(fregs);
break;
case FETCH_OP_RETVAL:
- val = regs_return_value(regs);
+ val = ftrace_regs_get_return_value(fregs);
break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
- val = regs_get_kernel_argument(regs, code->param);
+ val = ftrace_regs_get_argument(fregs, code->param);
+ break;
+ case FETCH_OP_EDATA:
+ val = *(unsigned long *)((unsigned long)edata + code->offset);
break;
#endif
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
@@ -170,7 +175,7 @@ NOKPROBE_SYMBOL(process_fetch_insn)
/* function entry handler */
static nokprobe_inline void
__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs,
+ struct ftrace_regs *fregs,
struct trace_event_file *trace_file)
{
struct fentry_trace_entry_head *entry;
@@ -184,37 +189,80 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = entry_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fentry_trace_func(tf, entry_ip, regs, link->file);
+ __fentry_trace_func(tf, entry_ip, fregs, link->file);
}
NOKPROBE_SYMBOL(fentry_trace_func);
-/* Kretprobe handler */
+static nokprobe_inline
+void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val = 0;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = ftrace_regs_get_argument(fregs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+
+/* function exit handler */
+static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
+ void *entry_data)
+{
+ struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
+
+ if (tf->tp.entry_arg)
+ store_fprobe_entry_data(entry_data, &tf->tp, fregs);
+
+ return 0;
+}
+NOKPROBE_SYMBOL(trace_fprobe_entry_handler)
+
static nokprobe_inline void
__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
- struct trace_event_file *trace_file)
+ unsigned long ret_ip, struct ftrace_regs *fregs,
+ void *entry_data, struct trace_event_file *trace_file)
{
struct fexit_trace_entry_head *entry;
struct trace_event_buffer fbuffer;
@@ -227,60 +275,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs)
+ unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file);
+ __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file);
}
NOKPROBE_SYMBOL(fexit_trace_func);
#ifdef CONFIG_PERF_EVENTS
static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fentry_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return 0;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->ip = entry_ip;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -289,30 +340,34 @@ NOKPROBE_SYMBOL(fentry_perf_func);
static void
fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs)
+ unsigned long ret_ip, struct ftrace_regs *fregs,
+ void *entry_data)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fexit_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tf->tp, regs);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -320,33 +375,34 @@ NOKPROBE_SYMBOL(fexit_perf_func);
#endif /* CONFIG_PERF_EVENTS */
static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
int ret = 0;
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fentry_trace_func(tf, entry_ip, regs);
+ fentry_trace_func(tf, entry_ip, fregs);
+
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- ret = fentry_perf_func(tf, entry_ip, regs);
+ ret = fentry_perf_func(tf, entry_ip, fregs);
#endif
return ret;
}
NOKPROBE_SYMBOL(fentry_dispatcher);
static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fexit_trace_func(tf, entry_ip, ret_ip, regs);
+ fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- fexit_perf_func(tf, entry_ip, ret_ip, regs);
+ fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data);
#endif
}
NOKPROBE_SYMBOL(fexit_dispatcher);
@@ -360,6 +416,9 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
}
}
+/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */
+DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including fprobe).
*/
@@ -367,10 +426,10 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *event,
const char *symbol,
struct tracepoint *tpoint,
- int maxactive,
+ struct module *mod,
int nargs, bool is_return)
{
- struct trace_fprobe *tf;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
int ret = -ENOMEM;
tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL);
@@ -379,7 +438,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tf->symbol)
- goto error;
+ return ERR_PTR(-ENOMEM);
if (is_return)
tf->fp.exit_handler = fexit_dispatcher;
@@ -387,17 +446,14 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->fp.entry_handler = fentry_dispatcher;
tf->tpoint = tpoint;
- tf->fp.nr_maxactive = maxactive;
+ tf->mod = mod;
- ret = trace_probe_init(&tf->tp, event, group, false);
+ ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tf->devent, &trace_fprobe_ops);
- return tf;
-error:
- free_trace_fprobe(tf);
- return ERR_PTR(ret);
+ return_ptr(tf);
}
static struct trace_fprobe *find_trace_fprobe(const char *event,
@@ -654,6 +710,24 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
return trace_probe_unregister_event_call(&tf->tp);
}
+static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
+{
+ struct tracepoint *tpoint = tf->tpoint;
+ unsigned long ip = (unsigned long)tpoint->probestub;
+ int ret;
+
+ /*
+ * Here, we do 2 steps to enable fprobe on a tracepoint.
+ * At first, put __probestub_##TP function on the tracepoint
+ * and put a fprobe on the stub function.
+ */
+ ret = tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+ if (ret < 0)
+ return ret;
+ return register_fprobe_ips(&tf->fp, &ip, 1);
+}
+
/* Internal register function - just handle fprobe and flags */
static int __register_trace_fprobe(struct trace_fprobe *tf)
{
@@ -680,18 +754,12 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
tf->fp.flags |= FPROBE_FL_DISABLED;
if (trace_fprobe_is_tracepoint(tf)) {
- struct tracepoint *tpoint = tf->tpoint;
- unsigned long ip = (unsigned long)tpoint->probestub;
- /*
- * Here, we do 2 steps to enable fprobe on a tracepoint.
- * At first, put __probestub_##TP function on the tracepoint
- * and put a fprobe on the stub function.
- */
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
- if (ret < 0)
- return ret;
- return register_fprobe_ips(&tf->fp, &ip, 1);
+
+ /* This tracepoint is not loaded yet */
+ if (tf->tpoint == TRACEPOINT_STUB)
+ return 0;
+
+ return __regsiter_tracepoint_fprobe(tf);
}
/* TODO: handle filter, nofilter or symbol list */
@@ -812,14 +880,12 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
struct trace_fprobe *old_tf;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
trace_probe_group_name(&tf->tp));
- if (old_tf) {
- ret = append_trace_fprobe(tf, old_tf);
- goto end;
- }
+ if (old_tf)
+ return append_trace_fprobe(tf, old_tf);
/* Register new event */
ret = register_fprobe_event(tf);
@@ -829,7 +895,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register fprobe */
@@ -839,29 +905,106 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
else
dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
-end:
- mutex_unlock(&event_mutex);
return ret;
}
+struct __find_tracepoint_cb_data {
+ const char *tp_name;
+ struct tracepoint *tpoint;
+ struct module *mod;
+};
+
+static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mod, void *priv)
+{
+ struct __find_tracepoint_cb_data *data = priv;
+
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name)) {
+ data->tpoint = tp;
+ if (!data->mod)
+ data->mod = mod;
+ }
+}
+
+static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
+{
+ struct __find_tracepoint_cb_data *data = priv;
+
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name))
+ data->tpoint = tp;
+}
+
+/* Find a tracepoint from kernel and module. */
+static struct tracepoint *find_tracepoint(const char *tp_name,
+ struct module **tp_mod)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ .mod = NULL,
+ };
+
+ for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
+
+ if (!data.tpoint && IS_ENABLED(CONFIG_MODULES)) {
+ for_each_module_tracepoint(__find_tracepoint_module_cb, &data);
+ *tp_mod = data.mod;
+ }
+
+ return data.tpoint;
+}
+
#ifdef CONFIG_MODULES
+static void reenable_trace_fprobe(struct trace_fprobe *tf)
+{
+ struct trace_probe *tp = &tf->tp;
+
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ __enable_trace_fprobe(tf);
+ }
+}
+
+/* Find a tracepoint from specified module. */
+static struct tracepoint *find_tracepoint_in_module(struct module *mod,
+ const char *tp_name)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ .mod = mod,
+ };
+
+ for_each_tracepoint_in_module(mod, __find_tracepoint_module_cb, &data);
+ return data.tpoint;
+}
+
static int __tracepoint_probe_module_cb(struct notifier_block *self,
unsigned long val, void *data)
{
struct tp_module *tp_mod = data;
+ struct tracepoint *tpoint;
struct trace_fprobe *tf;
struct dyn_event *pos;
- if (val != MODULE_STATE_GOING)
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
return NOTIFY_DONE;
mutex_lock(&event_mutex);
for_each_trace_fprobe(tf, pos) {
- if (tp_mod->mod == tf->mod) {
- tracepoint_probe_unregister(tf->tpoint,
+ if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
+ tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
+ if (tpoint) {
+ tf->tpoint = tpoint;
+ tf->mod = tp_mod->mod;
+ if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
+ trace_probe_is_enabled(&tf->tp))
+ reenable_trace_fprobe(tf);
+ }
+ } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
+ unregister_fprobe(&tf->fp);
+ if (trace_fprobe_is_tracepoint(tf)) {
+ tracepoint_probe_unregister(tf->tpoint,
tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
+ tf->tpoint = TRACEPOINT_STUB;
+ tf->mod = NULL;
+ }
}
}
mutex_unlock(&event_mutex);
@@ -874,30 +1017,6 @@ static struct notifier_block tracepoint_module_nb = {
};
#endif /* CONFIG_MODULES */
-struct __find_tracepoint_cb_data {
- const char *tp_name;
- struct tracepoint *tpoint;
-};
-
-static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
-{
- struct __find_tracepoint_cb_data *data = priv;
-
- if (!data->tpoint && !strcmp(data->tp_name, tp->name))
- data->tpoint = tp;
-}
-
-static struct tracepoint *find_tracepoint(const char *tp_name)
-{
- struct __find_tracepoint_cb_data data = {
- .tp_name = tp_name,
- };
-
- for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
-
- return data.tpoint;
-}
-
static int parse_symbol_and_return(int argc, const char *argv[],
char **symbol, bool *is_return,
bool is_tracepoint)
@@ -923,6 +1042,19 @@ static int parse_symbol_and_return(int argc, const char *argv[],
if (*is_return)
return 0;
+ if (is_tracepoint) {
+ tmp = *symbol;
+ while (*tmp && (isalnum(*tmp) || *tmp == '_'))
+ tmp++;
+ if (*tmp) {
+ /* find a wrong character. */
+ trace_probe_log_err(tmp - *symbol, BAD_TP_NAME);
+ kfree(*symbol);
+ *symbol = NULL;
+ return -EINVAL;
+ }
+ }
+
/* If there is $retval, this should be a return fprobe. */
for (i = 2; i < argc; i++) {
tmp = strstr(argv[i], "$retval");
@@ -930,6 +1062,8 @@ static int parse_symbol_and_return(int argc, const char *argv[],
if (is_tracepoint) {
trace_probe_log_set_index(i);
trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE);
+ kfree(*symbol);
+ *symbol = NULL;
return -EINVAL;
}
*is_return = true;
@@ -939,7 +1073,10 @@ static int parse_symbol_and_return(int argc, const char *argv[],
return 0;
}
-static int __trace_fprobe_create(int argc, const char *argv[])
+DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
+
+static int trace_fprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -965,22 +1102,20 @@ static int __trace_fprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_fprobe *tf = NULL;
- int i, len, new_argc = 0, ret = 0;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
+ int i, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL;
+ char *symbol __free(kfree) = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
- const char **new_argv = NULL;
- int maxactive = 0;
+ const char **new_argv __free(kfree) = NULL;
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char sbuf[KSYM_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
+ char *dbuf __free(kfree) = NULL;
bool is_tracepoint = false;
+ struct module *tp_mod __free(module_put) = NULL;
struct tracepoint *tpoint = NULL;
- struct traceprobe_parse_context ctx = {
- .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
- };
if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
return -ECANCELED;
@@ -990,35 +1125,13 @@ static int __trace_fprobe_create(int argc, const char *argv[])
group = TRACEPOINT_EVENT_SYSTEM;
}
- trace_probe_log_init("trace_fprobe", argc, argv);
-
- event = strchr(&argv[0][1], ':');
- if (event)
- event++;
-
- if (isdigit(argv[0][1])) {
- if (event)
- len = event - &argv[0][1] - 1;
- else
- len = strlen(&argv[0][1]);
- if (len > MAX_EVENT_NAME_LEN - 1) {
- trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- memcpy(buf, &argv[0][1], len);
- buf[len] = '\0';
- ret = kstrtouint(buf, 0, &maxactive);
- if (ret || !maxactive) {
+ if (argv[0][1] != '\0') {
+ if (argv[0][1] != ':') {
+ trace_probe_log_set_index(0);
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- /* fprobe rethook instances are iterated over via a list. The
- * maximum should stay reasonable.
- */
- if (maxactive > RETHOOK_MAXACTIVE_MAX) {
- trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
+ event = &argv[0][2];
}
trace_probe_log_set_index(1);
@@ -1026,20 +1139,14 @@ static int __trace_fprobe_create(int argc, const char *argv[])
/* a symbol(or tracepoint) must be specified */
ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
if (ret < 0)
- goto parse_error;
-
- if (!is_return && maxactive) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
- }
+ return -EINVAL;
trace_probe_log_set_index(0);
if (event) {
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return -EINVAL;
}
if (!event) {
@@ -1055,64 +1162,83 @@ static int __trace_fprobe_create(int argc, const char *argv[])
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
else
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
if (is_tracepoint) {
- ctx.flags |= TPARG_FL_TPOINT;
- tpoint = find_tracepoint(symbol);
- if (!tpoint) {
- trace_probe_log_set_index(1);
- trace_probe_log_err(0, NO_TRACEPOINT);
- goto parse_error;
+ ctx->flags |= TPARG_FL_TPOINT;
+ tpoint = find_tracepoint(symbol, &tp_mod);
+ /* lock module until register this tprobe. */
+ if (tp_mod && !try_module_get(tp_mod)) {
+ tpoint = NULL;
+ tp_mod = NULL;
}
- ctx.funcname = kallsyms_lookup(
+ if (tpoint) {
+ ctx->funcname = kallsyms_lookup(
(unsigned long)tpoint->probestub,
NULL, NULL, NULL, sbuf);
+ } else if (IS_ENABLED(CONFIG_MODULES)) {
+ /* This *may* be loaded afterwards */
+ tpoint = TRACEPOINT_STUB;
+ ctx->funcname = symbol;
+ } else {
+ trace_probe_log_set_index(1);
+ trace_probe_log_err(0, NO_TRACEPOINT);
+ return -EINVAL;
+ }
} else
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
argc -= 2; argv += 2;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
- if (IS_ERR(new_argv)) {
- ret = PTR_ERR(new_argv);
- new_argv = NULL;
- goto out;
- }
+ abuf, MAX_BTF_ARGS_LEN, ctx);
+ if (IS_ERR(new_argv))
+ return PTR_ERR(new_argv);
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS)
+ return -E2BIG;
+
+ ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
+ if (ret)
+ return ret;
/* setup a probe */
- tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
+ tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
argc, is_return);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tf is not allocated */
+ return ret;
}
- if (is_tracepoint)
- tf->mod = __module_text_address(
- (unsigned long)tf->tpoint->probestub);
-
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
+ }
+
+ if (is_return && tf->tp.entry_arg) {
+ tf->fp.entry_handler = trace_fprobe_entry_handler;
+ tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp);
+ if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_EARGS);
+ return -E2BIG;
+ }
}
ret = traceprobe_set_print_fmt(&tf->tp,
is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_fprobe(tf);
if (ret) {
@@ -1123,26 +1249,32 @@ static int __trace_fprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return -EINVAL;
}
-out:
+ /* 'tf' is successfully registered. To avoid freeing, assign NULL. */
+ tf = NULL;
+
+ return 0;
+}
+
+static int trace_fprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context ctx = {
+ .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
+ };
+ int ret;
+
+ trace_probe_log_init("trace_fprobe", argc, argv);
+ ret = trace_fprobe_create_internal(argc, argv, &ctx);
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_fprobe(tf);
- goto out;
}
static int trace_fprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_fprobe_create);
+ return trace_probe_create(raw_command, trace_fprobe_create_cb);
}
static int trace_fprobe_release(struct dyn_event *ev)
@@ -1164,8 +1296,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
seq_putc(m, 't');
else
seq_putc(m, 'f');
- if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive)
- seq_printf(m, "%d", tf->fp.nr_maxactive);
seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp),
trace_probe_name(&tf->tp));
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..df56f9b76010 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -80,6 +80,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr)
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent)
{
+ int ret;
/*
* The top level array uses the "global_ops", and the files are
* created on boot up.
@@ -90,6 +91,12 @@ int ftrace_create_function_files(struct trace_array *tr,
if (!tr->ops)
return -EINVAL;
+ ret = allocate_fgraph_ops(tr, tr->ops);
+ if (ret) {
+ kfree(tr->ops);
+ return ret;
+ }
+
ftrace_create_filter_files(tr->ops, parent);
return 0;
@@ -99,6 +106,7 @@ void ftrace_destroy_function_files(struct trace_array *tr)
{
ftrace_destroy_filter_files(tr->ops);
ftrace_free_ftrace_ops(tr);
+ free_fgraph_ops(tr);
}
static ftrace_func_t select_trace_function(u32 flags_val)
@@ -168,6 +176,28 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(&tr->array_buffer);
}
+/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
+static __always_inline unsigned long
+function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
+{
+ unsigned long true_parent_ip;
+ int idx = 0;
+
+ true_parent_ip = parent_ip;
+ if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs)
+ true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip,
+ (unsigned long *)ftrace_regs_get_stack_pointer(fregs));
+ return true_parent_ip;
+}
+#else
+static __always_inline unsigned long
+function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
+{
+ return parent_ip;
+}
+#endif
+
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
@@ -176,7 +206,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
- int cpu;
if (unlikely(!tr->function_enabled))
return;
@@ -185,10 +214,11 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- trace_ctx = tracing_gen_ctx();
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ trace_ctx = tracing_gen_ctx_dec();
+
+ data = this_cpu_ptr(tr->array_buffer.data);
if (!atomic_read(&data->disabled))
trace_function(tr, ip, parent_ip, trace_ctx);
@@ -223,6 +253,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
long disabled;
int cpu;
unsigned int trace_ctx;
+ int skip = STACK_SKIP;
if (unlikely(!tr->function_enabled))
return;
@@ -232,6 +263,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
* recursive protection is performed.
*/
local_irq_save(flags);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
@@ -239,7 +271,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
trace_function(tr, ip, parent_ip, trace_ctx);
- __trace_stack(tr, trace_ctx, STACK_SKIP);
+#ifdef CONFIG_UNWINDER_FRAME_POINTER
+ if (ftrace_pids_enabled(op))
+ skip++;
+#endif
+ __trace_stack(tr, trace_ctx, skip);
}
atomic_dec(&data->disabled);
@@ -285,9 +321,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned int trace_ctx;
- unsigned long flags;
int bit;
- int cpu;
if (unlikely(!tr->function_enabled))
return;
@@ -296,8 +330,8 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
+ data = this_cpu_ptr(tr->array_buffer.data);
if (atomic_read(&data->disabled))
goto out;
@@ -308,12 +342,11 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
* TODO: think about a solution that is better than just hoping to be
* lucky.
*/
- last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ last_info = this_cpu_ptr(tr->last_func_repeats);
if (is_repeat_check(tr, last_info, ip, parent_ip))
goto out;
- local_save_flags(flags);
- trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
trace_function(tr, ip, parent_ip, trace_ctx);
@@ -343,6 +376,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
* recursive protection is performed.
*/
local_irq_save(flags);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c35fbaab2a47..136c750b0b4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -31,7 +31,10 @@ struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
- struct ftrace_graph_ent_entry ent;
+ union {
+ struct ftrace_graph_ent_entry ent;
+ struct fgraph_retaddr_ent_entry rent;
+ } ent;
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -64,6 +67,10 @@ static struct tracer_opt trace_opts[] = {
/* Display function return value in hexadecimal format ? */
{ TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
#endif
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ /* Display function return address ? */
+ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -83,7 +90,10 @@ static struct tracer_flags tracer_flags = {
.opts = trace_opts
};
-static struct trace_array *graph_array;
+static bool tracer_flags_is_set(u32 flags)
+{
+ return (tracer_flags.val & flags) == flags;
+}
/*
* DURATION column is being also used to display IRQ signs,
@@ -104,7 +114,6 @@ int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
@@ -115,12 +124,43 @@ int __trace_graph_entry(struct trace_array *tr,
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct fgraph_retaddr_ent_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
+ sizeof(*entry), trace_ctx);
+ if (!event)
+ return 0;
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent.func = trace->func;
+ entry->graph_ent.depth = trace->depth;
+ entry->graph_ent.retaddr = retaddr;
+ trace_buffer_unlock_commit_nostack(buffer, event);
+
+ return 1;
+}
+#else
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr)
+{
+ return 1;
+}
+#endif
+
static inline int ftrace_graph_ignore_irqs(void)
{
if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
@@ -129,17 +169,25 @@ static inline int ftrace_graph_ignore_irqs(void)
return in_hardirq();
}
-int trace_graph_entry(struct ftrace_graph_ent *trace)
+struct fgraph_times {
+ unsigned long long calltime;
+ unsigned long long sleeptime; /* may be optional! */
+};
+
+int trace_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct trace_array *tr = graph_array;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
- unsigned long flags;
+ struct fgraph_times *ftimes;
unsigned int trace_ctx;
long disabled;
- int ret;
+ int ret = 0;
int cpu;
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+ if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
/*
@@ -150,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+ *task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
@@ -161,12 +209,25 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (!ftrace_trace_task(tr))
return 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
if (ftrace_graph_ignore_irqs())
return 0;
+ if (fgraph_sleep_time) {
+ /* Only need to record the calltime */
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
+ } else {
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes));
+ if (ftimes)
+ ftimes->sleeptime = current->ftrace_sleeptime;
+ }
+ if (!ftimes)
+ return 0;
+
+ ftimes->calltime = trace_clock_local();
+
/*
* Stop here if tracing_threshold is set. We only write function return
* events to the ring buffer.
@@ -174,19 +235,21 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (tracing_thresh)
return 1;
- local_irq_save(flags);
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- ret = __trace_graph_entry(tr, trace, trace_ctx);
- } else {
- ret = 0;
+ disabled = atomic_read(&data->disabled);
+ if (likely(!disabled)) {
+ trace_ctx = tracing_gen_ctx();
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+ tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) {
+ unsigned long retaddr = ftrace_graph_top_ret_addr(current);
+ ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+ } else {
+ ret = __trace_graph_entry(tr, trace, trace_ctx);
+ }
}
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ preempt_enable_notrace();
return ret;
}
@@ -203,12 +266,10 @@ __trace_graph_function(struct trace_array *tr,
struct ftrace_graph_ret ret = {
.func = ip,
.depth = 0,
- .calltime = time,
- .rettime = time,
};
__trace_graph_entry(tr, &ent, trace_ctx);
- __trace_graph_return(tr, &ret, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx, time, time);
}
void
@@ -220,10 +281,10 @@ trace_graph_function(struct trace_array *tr,
}
void __trace_graph_return(struct trace_array *tr,
- struct ftrace_graph_ret *trace,
- unsigned int trace_ctx)
+ struct ftrace_graph_ret *trace,
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime)
{
- struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
@@ -234,82 +295,140 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ entry->calltime = calltime;
+ entry->rettime = rettime;
+ trace_buffer_unlock_commit_nostack(buffer, event);
+}
+
+static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+ struct fgraph_times *ftimes,
+ int size)
+{
+ if (fgraph_sleep_time || size < sizeof(*ftimes))
+ return;
+
+ ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime;
}
-void trace_graph_return(struct ftrace_graph_ret *trace)
+void trace_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
- struct trace_array *tr = graph_array;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
- unsigned long flags;
+ struct fgraph_times *ftimes;
unsigned int trace_ctx;
+ u64 calltime, rettime;
long disabled;
+ int size;
int cpu;
- ftrace_graph_addr_finish(trace);
+ rettime = trace_clock_local();
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
- trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ ftrace_graph_addr_finish(gops, trace);
+
+ if (*task_var & TRACE_GRAPH_NOTRACE) {
+ *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
- local_irq_save(flags);
+ ftimes = fgraph_retrieve_data(gops->idx, &size);
+ if (!ftimes)
+ return;
+
+ handle_nosleeptime(trace, ftimes, size);
+
+ calltime = ftimes->calltime;
+
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
+ disabled = atomic_read(&data->disabled);
+ if (likely(!disabled)) {
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ preempt_enable_notrace();
}
-void set_graph_array(struct trace_array *tr)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- graph_array = tr;
+ struct fgraph_times *ftimes;
+ int size;
- /* Make graph_array visible before we start tracing */
-
- smp_mb();
-}
-
-static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
-{
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
return;
}
+ ftimes = fgraph_retrieve_data(gops->idx, &size);
+ if (!ftimes)
+ return;
+
+ handle_nosleeptime(trace, ftimes, size);
+
if (tracing_thresh &&
- (trace->rettime - trace->calltime < tracing_thresh))
+ (trace_clock_local() - ftimes->calltime < tracing_thresh))
return;
else
- trace_graph_return(trace);
+ trace_graph_return(trace, gops, fregs);
}
-static struct fgraph_ops funcgraph_thresh_ops = {
- .entryfunc = &trace_graph_entry,
- .retfunc = &trace_graph_thresh_return,
-};
-
static struct fgraph_ops funcgraph_ops = {
.entryfunc = &trace_graph_entry,
.retfunc = &trace_graph_return,
};
+int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ struct fgraph_ops *gops;
+
+ gops = kzalloc(sizeof(*gops), GFP_KERNEL);
+ if (!gops)
+ return -ENOMEM;
+
+ gops->entryfunc = &trace_graph_entry;
+ gops->retfunc = &trace_graph_return;
+
+ tr->gops = gops;
+ gops->private = tr;
+
+ fgraph_init_ops(&gops->ops, ops);
+
+ return 0;
+}
+
+void free_fgraph_ops(struct trace_array *tr)
+{
+ kfree(tr->gops);
+}
+
+__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ tr->gops = &funcgraph_ops;
+ funcgraph_ops.private = tr;
+ fgraph_init_ops(&tr->gops->ops, ops);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
int ret;
- set_graph_array(tr);
+ tr->gops->entryfunc = trace_graph_entry;
+
if (tracing_thresh)
- ret = register_ftrace_graph(&funcgraph_thresh_ops);
+ tr->gops->retfunc = trace_graph_thresh_return;
else
- ret = register_ftrace_graph(&funcgraph_ops);
+ tr->gops->retfunc = trace_graph_return;
+
+ /* Make gops functions are visible before we start tracing */
+ smp_mb();
+
+ ret = register_ftrace_graph(tr->gops);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -320,10 +439,7 @@ static int graph_trace_init(struct trace_array *tr)
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
- if (tracing_thresh)
- unregister_ftrace_graph(&funcgraph_thresh_ops);
- else
- unregister_ftrace_graph(&funcgraph_ops);
+ unregister_ftrace_graph(tr->gops);
}
static int graph_trace_update_thresh(struct trace_array *tr)
@@ -434,7 +550,7 @@ get_return_for_leaf(struct trace_iterator *iter,
* then we just reuse the data from before.
*/
if (data && data->failed) {
- curr = &data->ent;
+ curr = &data->ent.ent;
next = &data->ret;
} else {
@@ -464,7 +580,10 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- data->ent = *curr;
+ if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
+ data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
+ else
+ data->ent.ent = *curr;
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
@@ -521,6 +640,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
+ addr += iter->tr->text_delta;
+
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return;
@@ -626,52 +747,96 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
}
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
-
#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+#else
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+#endif
-static void print_graph_retval(struct trace_seq *s, unsigned long retval,
- bool leaf, void *func, bool hex_format)
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR
+static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry,
+ u32 trace_flags, bool comment)
+{
+ if (comment)
+ trace_seq_puts(s, " /*");
+
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+
+ if (comment)
+ trace_seq_puts(s, " */");
+}
+#else
+#define __TRACE_GRAPH_PRINT_RETADDR 0
+#define print_graph_retaddr(_seq, _entry, _tflags, _comment) do { } while (0)
+#endif
+
+#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR)
+
+static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
+ struct ftrace_graph_ret *graph_ret, void *func,
+ u32 opt_flags, u32 trace_flags)
{
unsigned long err_code = 0;
+ unsigned long retval = 0;
+ bool print_retaddr = false;
+ bool print_retval = false;
+ bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX);
+
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ retval = graph_ret->retval;
+ print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL);
+#endif
- if (retval == 0 || hex_format)
- goto done;
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR);
+#endif
- /* Check if the return value matches the negative format */
- if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
- (((u64)retval) >> 32) == 0) {
- /* sign extension */
- err_code = (unsigned long)(s32)retval;
- } else {
- err_code = retval;
+ if (print_retval && retval && !hex_format) {
+ /* Check if the return value matches the negative format */
+ if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+ (((u64)retval) >> 32) == 0) {
+ err_code = sign_extend64(retval, 31);
+ } else {
+ err_code = retval;
+ }
+
+ if (!IS_ERR_VALUE(err_code))
+ err_code = 0;
}
- if (!IS_ERR_VALUE(err_code))
- err_code = 0;
+ if (entry) {
+ if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
+ print_retaddr = false;
-done:
- if (leaf) {
- if (hex_format || (err_code == 0))
- trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
- func, retval);
+ trace_seq_printf(s, "%ps();", func);
+ if (print_retval || print_retaddr)
+ trace_seq_puts(s, " /*");
else
- trace_seq_printf(s, "%ps(); /* = %ld */\n",
- func, err_code);
+ trace_seq_putc(s, '\n');
} else {
+ print_retaddr = false;
+ trace_seq_printf(s, "} /* %ps", func);
+ }
+
+ if (print_retaddr)
+ print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+ trace_flags, false);
+
+ if (print_retval) {
if (hex_format || (err_code == 0))
- trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
- func, retval);
+ trace_seq_printf(s, " ret=0x%lx", retval);
else
- trace_seq_printf(s, "} /* %ps = %ld */\n",
- func, err_code);
+ trace_seq_printf(s, " ret=%ld", err_code);
}
+
+ if (!entry || print_retval || print_retaddr)
+ trace_seq_puts(s, " */\n");
}
#else
-#define __TRACE_GRAPH_PRINT_RETVAL 0
-
-#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0)
#endif
@@ -687,12 +852,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
+ unsigned long func;
int cpu = iter->cpu;
int i;
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
- duration = graph_ret->rettime - graph_ret->calltime;
+ duration = ret_entry->rettime - ret_entry->calltime;
+
+ func = call->func + iter->tr->text_delta;
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -720,14 +888,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
trace_seq_putc(s, ' ');
/*
- * Write out the function return value if the option function-retval is
- * enabled.
+ * Write out the function return value or return address
*/
- if (flags & __TRACE_GRAPH_PRINT_RETVAL)
- print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
- !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
- else
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
+ print_graph_retval(s, entry, graph_ret,
+ (void *)graph_ret->func + iter->tr->text_delta,
+ flags, tr->trace_flags);
+ } else {
+ trace_seq_printf(s, "%ps();\n", (void *)func);
+ }
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -743,6 +912,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
int i;
if (data) {
@@ -765,7 +935,14 @@ print_graph_entry_nested(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+ func = call->func + iter->tr->text_delta;
+
+ trace_seq_printf(s, "%ps() {", (void *)func);
+ if (flags & __TRACE_GRAPH_PRINT_RETADDR &&
+ entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
+ print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+ tr->trace_flags, true);
+ trace_seq_putc(s, '\n');
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
@@ -840,6 +1017,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
int *depth_irq;
struct fgraph_data *data = iter->private;
+ addr += iter->tr->text_delta;
+
/*
* If we are either displaying irqs, or we got called as
* a graph event and private data does not exist,
@@ -960,18 +1139,24 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
}
static enum print_line_t
-print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s,
struct trace_entry *ent, struct trace_iterator *iter,
u32 flags)
{
- unsigned long long duration = trace->rettime - trace->calltime;
+ struct ftrace_graph_ret *trace = &retentry->ret;
+ u64 calltime = retentry->calltime;
+ u64 rettime = retentry->rettime;
+ unsigned long long duration = rettime - calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
int i;
+ func = trace->func + iter->tr->text_delta;
+
if (check_irq_return(iter, flags, trace->depth))
return TRACE_TYPE_HANDLED;
@@ -1007,11 +1192,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
/*
* Always write out the function name and its return value if the
- * function-retval option is enabled.
+ * funcgraph-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, trace->retval, false, (void *)trace->func,
- !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags);
} else {
/*
* If the return function does not have a matching entry,
@@ -1023,7 +1207,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
trace_seq_puts(s, "}\n");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ trace_seq_printf(s, "} /* %ps */\n", (void *)func);
}
/* Overrun */
@@ -1126,7 +1310,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
* to print out the missing entry which would never go out.
*/
if (data && data->failed) {
- field = &data->ent;
+ field = &data->ent.ent;
iter->cpu = data->cpu;
ret = print_graph_entry(field, s, iter, flags);
if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
@@ -1150,10 +1334,20 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
saved = *field;
return print_graph_entry(&saved, s, iter, flags);
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ case TRACE_GRAPH_RETADDR_ENT: {
+ struct fgraph_retaddr_ent_entry saved;
+ struct fgraph_retaddr_ent_entry *rfield;
+
+ trace_assign_type(rfield, entry);
+ saved = *rfield;
+ return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
+ }
+#endif
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter, flags);
+ return print_graph_return(field, s, entry, iter, flags);
}
case TRACE_STACK:
case TRACE_FN:
@@ -1344,6 +1538,13 @@ static struct trace_event graph_trace_entry_event = {
.funcs = &graph_functions,
};
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+static struct trace_event graph_trace_retaddr_entry_event = {
+ .type = TRACE_GRAPH_RETADDR_ENT,
+ .funcs = &graph_functions,
+};
+#endif
+
static struct trace_event graph_trace_ret_event = {
.type = TRACE_GRAPH_RET,
.funcs = &graph_functions
@@ -1362,6 +1563,7 @@ static struct tracer graph_trace __tracer_data = {
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
+ .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
@@ -1429,6 +1631,13 @@ static __init int init_graph_trace(void)
return 1;
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ if (!register_trace_event(&graph_trace_retaddr_entry_event)) {
+ pr_warn("Warning: could not register graph trace retaddr events\n");
+ return 1;
+ }
+#endif
+
if (!register_trace_event(&graph_trace_ret_event)) {
pr_warn("Warning: could not register graph trace events\n");
return 1;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b791524a6536..b65353ec2837 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -130,7 +130,6 @@ static bool hwlat_busy;
static void trace_hwlat_sample(struct hwlat_sample *sample)
{
struct trace_array *tr = hwlat_trace;
- struct trace_event_call *call = &event_hwlat;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
@@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
entry->nmi_count = sample->nmi_count;
entry->count = sample->count;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/* Macros to encapsulate the time capturing infrastructure */
@@ -520,6 +518,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
goto out_unlock;
+ if (!cpu_online(cpu))
+ goto out_unlock;
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
goto out_unlock;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ba37f768e2f2..7294ad676379 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -175,15 +175,18 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
return start_irqsoff_tracer(irqsoff_trace, set);
}
-static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
int ret;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -198,6 +201,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_dec(tr, &data, &flags))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
trace_ctx = tracing_gen_ctx_flags(flags);
ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
@@ -205,20 +214,30 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
+ rettime = trace_clock_local();
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+
trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
atomic_dec(&data->disabled);
}
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..1e72d20b3c2f 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -96,22 +96,19 @@ static int kdb_ftdump(int argc, const char **argv)
{
int skip_entries = 0;
long cpu_file;
- char *cp;
+ int err;
int cnt;
int cpu;
if (argc > 2)
return KDB_ARGCOUNT;
- if (argc) {
- skip_entries = simple_strtol(argv[1], &cp, 0);
- if (*cp)
- skip_entries = 0;
- }
+ if (argc && kstrtoint(argv[1], 0, &skip_entries))
+ return KDB_BADINT;
if (argc == 2) {
- cpu_file = simple_strtol(argv[2], &cp, 0);
- if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ err = kstrtol(argv[2], 0, &cpu_file);
+ if (err || cpu_file >= NR_CPUS || cpu_file < 0 ||
!cpu_online(cpu_file))
return KDB_BADINT;
} else {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c4c6e0e0068b..d8d5f18a141a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/bpf-cgroup.h>
+#include <linux/cleanup.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -111,6 +112,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}
+#ifdef CONFIG_MODULES
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
char *p;
@@ -129,6 +131,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
return ret;
}
+#else
+static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+{
+ return false;
+}
+#endif
static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
@@ -250,6 +258,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
}
}
+DEFINE_FREE(free_trace_kprobe, struct trace_kprobe *,
+ if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -261,7 +272,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
int maxactive,
int nargs, bool is_return)
{
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret = -ENOMEM;
tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
@@ -270,12 +281,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
tk->nhit = alloc_percpu(unsigned long);
if (!tk->nhit)
- goto error;
+ return ERR_PTR(ret);
if (symbol) {
tk->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tk->symbol)
- goto error;
+ return ERR_PTR(ret);
tk->rp.kp.symbol_name = tk->symbol;
tk->rp.kp.offset = offs;
} else
@@ -290,15 +301,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
INIT_HLIST_NODE(&tk->rp.kp.hlist);
INIT_LIST_HEAD(&tk->rp.kp.list);
- ret = trace_probe_init(&tk->tp, event, group, false);
+ ret = trace_probe_init(&tk->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tk->devent, &trace_kprobe_ops);
- return tk;
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return_ptr(tk);
}
static struct trace_kprobe *find_trace_kprobe(const char *event,
@@ -627,7 +635,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
struct trace_kprobe *old_tk;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
trace_probe_group_name(&tk->tp));
@@ -635,11 +643,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_kprobe(tk, old_tk);
+ return -EEXIST;
}
- goto end;
+ return append_trace_kprobe(tk, old_tk);
}
/* Register new event */
@@ -650,7 +656,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register k*probe */
@@ -665,8 +671,22 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
else
dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
-end:
- mutex_unlock(&event_mutex);
+ return ret;
+}
+
+#ifdef CONFIG_MODULES
+static int validate_module_probe_symbol(const char *modname, const char *symbol);
+
+static int register_module_trace_kprobe(struct module *mod, struct trace_kprobe *tk)
+{
+ const char *p;
+ int ret = 0;
+
+ p = strchr(trace_kprobe_symbol(tk), ':');
+ if (p)
+ ret = validate_module_probe_symbol(module_name(mod), p + 1);
+ if (!ret)
+ ret = __register_trace_kprobe(tk);
return ret;
}
@@ -683,27 +703,36 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/* Update probes on coming module */
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
__unregister_trace_kprobe(tk);
- ret = __register_trace_kprobe(tk);
+ ret = register_module_trace_kprobe(mod, tk);
if (ret)
pr_warn("Failed to re-register probe %s on %s: %d\n",
trace_probe_name(&tk->tp),
module_name(mod), ret);
}
}
- mutex_unlock(&event_mutex);
return NOTIFY_DONE;
}
static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
- .priority = 1 /* Invoked after kprobe module callback */
+ .priority = 2 /* Invoked after kprobe and jump_label module callback */
};
+static int trace_kprobe_register_module_notifier(void)
+{
+ return register_module_notifier(&trace_kprobe_module_nb);
+}
+#else
+static int trace_kprobe_register_module_notifier(void)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
static int count_symbols(void *data, unsigned long unused)
{
@@ -729,18 +758,86 @@ static int count_mod_symbols(void *data, const char *name, unsigned long unused)
return 0;
}
-static unsigned int number_of_same_symbols(char *func_name)
+static unsigned int number_of_same_symbols(const char *mod, const char *func_name)
{
struct sym_count_ctx ctx = { .count = 0, .name = func_name };
- kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ if (!mod)
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
- module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+ module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
}
-static int __trace_kprobe_create(int argc, const char *argv[])
+static int validate_module_probe_symbol(const char *modname, const char *symbol)
+{
+ unsigned int count = number_of_same_symbols(modname, symbol);
+
+ if (count > 1) {
+ /*
+ * Users should use ADDR to remove the ambiguity of
+ * using KSYM only.
+ */
+ return -EADDRNOTAVAIL;
+ } else if (count == 0) {
+ /*
+ * We can return ENOENT earlier than when register the
+ * kprobe.
+ */
+ return -ENOENT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+/* Return NULL if the module is not loaded or under unloading. */
+static struct module *try_module_get_by_name(const char *name)
+{
+ struct module *mod;
+
+ rcu_read_lock_sched();
+ mod = find_module(name);
+ if (mod && !try_module_get(mod))
+ mod = NULL;
+ rcu_read_unlock_sched();
+
+ return mod;
+}
+#else
+#define try_module_get_by_name(name) (NULL)
+#endif
+
+static int validate_probe_symbol(char *symbol)
+{
+ struct module *mod = NULL;
+ char *modname = NULL, *p;
+ int ret = 0;
+
+ p = strchr(symbol, ':');
+ if (p) {
+ modname = symbol;
+ symbol = p + 1;
+ *p = '\0';
+ mod = try_module_get_by_name(modname);
+ if (!mod)
+ goto out;
+ }
+
+ ret = validate_module_probe_symbol(modname, symbol);
+out:
+ if (p)
+ *p = ':';
+ if (mod)
+ module_put(mod);
+ return ret;
+}
+
+static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
+ struct pt_regs *regs);
+
+static int trace_kprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -766,11 +863,12 @@ static int __trace_kprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_kprobe *tk = NULL;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int i, len, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL, *tmp = NULL;
- const char **new_argv = NULL;
+ char *symbol __free(kfree) = NULL;
+ char *tmp = NULL;
+ const char **new_argv __free(kfree) = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
enum probe_print_type ptype;
int maxactive = 0;
@@ -779,7 +877,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
- struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ char *dbuf __free(kfree) = NULL;
switch (argv[0][0]) {
case 'r':
@@ -793,8 +891,6 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (argc < 2)
return -ECANCELED;
- trace_probe_log_init("trace_kprobe", argc, argv);
-
event = strchr(&argv[0][1], ':');
if (event)
event++;
@@ -802,7 +898,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (isdigit(argv[0][1])) {
if (!is_return) {
trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
+ return -EINVAL;
}
if (event)
len = event - &argv[0][1] - 1;
@@ -810,21 +906,21 @@ static int __trace_kprobe_create(int argc, const char *argv[])
len = strlen(&argv[0][1]);
if (len > MAX_EVENT_NAME_LEN - 1) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
memcpy(buf, &argv[0][1], len);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
/* kretprobes instances are iterated over via a list. The
* maximum should stay reasonable.
*/
if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -833,10 +929,9 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
trace_probe_log_set_index(1);
/* Check whether uprobe event specified */
- if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
- ret = -ECANCELED;
- goto error;
- }
+ if (strchr(argv[1], '/') && strchr(argv[1], ':'))
+ return -ECANCELED;
+
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
if (!symbol)
@@ -849,7 +944,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
is_return = true;
} else {
trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -857,42 +952,25 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
trace_probe_log_err(0, BAD_PROBE_ADDR);
- goto parse_error;
+ return -EINVAL;
+ }
+ ret = validate_probe_symbol(symbol);
+ if (ret) {
+ if (ret == -EADDRNOTAVAIL)
+ trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+ else
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ return -EINVAL;
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
if (ret == 0 && !is_return)
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
/* Defer the ENOENT case until register kprobe */
if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
- goto parse_error;
- }
- }
-
- if (symbol && !strchr(symbol, ':')) {
- unsigned int count;
-
- count = number_of_same_symbols(symbol);
- if (count > 1) {
- /*
- * Users should use ADDR to remove the ambiguity of
- * using KSYM only.
- */
- trace_probe_log_err(0, NON_UNIQ_SYMBOL);
- ret = -EADDRNOTAVAIL;
-
- goto error;
- } else if (count == 0) {
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- trace_probe_log_err(0, BAD_PROBE_ADDR);
- ret = -ENOENT;
-
- goto error;
+ return -EINVAL;
}
}
@@ -901,7 +979,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return ret;
}
if (!event) {
@@ -917,18 +995,24 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
argc -= 2; argv += 2;
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
+ abuf, MAX_BTF_ARGS_LEN, ctx);
if (IS_ERR(new_argv)) {
ret = PTR_ERR(new_argv);
new_argv = NULL;
- goto out;
+ return ret;
}
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS)
+ return -E2BIG;
+
+ ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
+ if (ret)
+ return ret;
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
@@ -937,22 +1021,27 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = PTR_ERR(tk);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tk is not allocated */
+ return ret; /* We know tk is not allocated */
}
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
+ }
+ /* entry handler for kretprobe */
+ if (is_return && tk->tp.entry_arg) {
+ tk->rp.entry_handler = trace_kprobe_entry_handler;
+ tk->rp.data_size = traceprobe_get_entry_data_size(&tk->tp);
}
ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_kprobe(tk);
if (ret) {
@@ -963,26 +1052,34 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return ret;
}
+ /*
+ * Here, 'tk' has been registered to the list successfully,
+ * so we don't need to free it.
+ */
+ tk = NULL;
+
+ return 0;
+}
+
+static int trace_kprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ int ret;
+
+ trace_probe_log_init("trace_kprobe", argc, argv);
+
+ ret = trace_kprobe_create_internal(argc, argv, &ctx);
-out:
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_kprobe(tk);
- goto out;
}
static int trace_kprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_kprobe_create);
+ return trace_probe_create(raw_command, trace_kprobe_create_cb);
}
static int create_or_delete_trace_kprobe(const char *raw_command)
@@ -1303,8 +1400,8 @@ static const struct file_operations kprobe_profile_ops = {
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -1329,6 +1426,9 @@ retry:
case FETCH_OP_ARG:
val = regs_get_kernel_argument(regs, code->param);
break;
+ case FETCH_OP_EDATA:
+ val = *(unsigned long *)((unsigned long)edata + code->offset);
+ break;
#endif
case FETCH_NOP_SYMBOL: /* Ignore a place holder */
code++;
@@ -1359,7 +1459,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tk->tp.size + dsize);
@@ -1368,7 +1468,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
fbuffer.regs = regs;
entry->ip = (unsigned long)tk->rp.kp.addr;
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -1384,6 +1484,31 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
+
+static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ struct kretprobe *rp = get_kretprobe(ri);
+ struct trace_kprobe *tk;
+
+ /*
+ * There is a small chance that get_kretprobe(ri) returns NULL when
+ * the kretprobe is unregister on another CPU between kretprobe's
+ * trampoline_handler and this function.
+ */
+ if (unlikely(!rp))
+ return -ENOENT;
+
+ tk = container_of(rp, struct trace_kprobe, rp);
+
+ /* store argument values into ri->data as entry data */
+ if (tk->tp.entry_arg)
+ store_trace_entry_data(ri->data, &tk->tp, regs);
+
+ return 0;
+}
+
+
static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
@@ -1399,7 +1524,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, ri->data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tk->tp.size + dsize);
@@ -1409,7 +1534,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
fbuffer.regs = regs;
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
@@ -1557,7 +1682,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, NULL);
__size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1568,7 +1693,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -1593,7 +1718,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tk->tp, regs);
+ dsize = __get_data_size(&tk->tp, regs, ri->data);
__size = sizeof(*entry) + tk->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1604,7 +1729,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = get_kretprobe_retaddr(ri);
- store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -1770,26 +1895,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
enum probe_print_type ptype;
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret;
char *event;
if (func) {
- unsigned int count;
-
- count = number_of_same_symbols(func);
- if (count > 1)
- /*
- * Users should use addr to remove the ambiguity of
- * using func only.
- */
- return ERR_PTR(-EADDRNOTAVAIL);
- else if (count == 0)
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- return ERR_PTR(-ENOENT);
+ ret = validate_probe_symbol(func);
+ if (ret)
+ return ERR_PTR(ret);
}
/*
@@ -1813,19 +1926,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
ptype = trace_kprobe_is_return(tk) ?
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
- if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
- ret = -ENOMEM;
- goto error;
- }
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0)
+ return ERR_PTR(-ENOMEM);
ret = __register_trace_kprobe(tk);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
- return trace_probe_event_call(&tk->tp);
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return trace_probe_event_call(&(no_free_ptr(tk)->tp));
}
void destroy_local_trace_kprobe(struct trace_event_call *event_call)
@@ -1854,13 +1962,12 @@ static __init void enable_boot_kprobe_events(void)
struct trace_kprobe *tk;
struct dyn_event *pos;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
list_for_each_entry(file, &tr->events, list)
if (file->event_call == trace_probe_event_call(&tk->tp))
trace_event_enable_disable(file, 1, 0);
}
- mutex_unlock(&event_mutex);
}
static __init void setup_boot_kprobe_events(void)
@@ -1897,7 +2004,7 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;
- if (register_module_notifier(&trace_kprobe_module_nb))
+ if (trace_kprobe_register_module_notifier())
return -EINVAL;
return 0;
@@ -1963,19 +2070,16 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function entry.\n");
+ if (WARN_ONCE(ret, "error on probing function entry.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on probing function entry.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -1984,19 +2088,16 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function return.\n");
+ if (WARN_ONCE(ret, "error on probing function return.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2019,18 +2120,15 @@ static __init int kprobe_trace_self_tests_init(void)
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2038,18 +2136,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe2 hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe2 hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2057,23 +2152,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("-:testprobe");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
ret = create_or_delete_trace_kprobe("-:testprobe2");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
+
end:
- ret = dyn_events_release_all(&trace_kprobe_ops);
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on cleaning up probes.\n");
- warn++;
- }
/*
* Wait for the optimizer work to finish. Otherwise it might fiddle
* with probes in already freed __init text.
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 64e77b513697..ba5858866b2f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
- struct trace_event_call *call = &event_mmiotrace_rw;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
@@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->rw = *rw;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
- struct trace_event_call *call = &event_mmiotrace_map;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
@@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->map = *map;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a8e28f9b9271..f3a2722ee4c0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void)
return this_cpu_ptr(&per_cpu_osnoise_var);
}
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Runtime information for the timer mode.
@@ -259,14 +264,20 @@ static inline void tlat_var_reset(void)
{
struct timerlat_variables *tlat_var;
int cpu;
+
+ /* Synchronize with the timerlat interfaces */
+ mutex_lock(&interface_lock);
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
for_each_cpu(cpu, cpu_online_mask) {
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
#define tlat_var_reset() do {} while (0)
@@ -332,11 +343,6 @@ struct timerlat_sample {
#endif
/*
- * Protect the interface.
- */
-static struct mutex interface_lock;
-
-/*
* Tracer data.
*/
static struct osnoise_data {
@@ -493,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s)
static void
__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -511,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe
entry->softirq_count = sample->softirq_count;
entry->thread_count = sample->thread_count;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
@@ -572,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s)
static void
__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -585,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf
entry->context = sample->context;
entry->timer_latency = sample->timer_latency;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
@@ -648,7 +650,6 @@ static void timerlat_save_stack(int skip)
static void
__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct stack_entry *entry;
@@ -662,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
@@ -1229,6 +1229,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int
}
}
+static bool monitor_enabled;
+
static int register_migration_monitor(void)
{
int ret = 0;
@@ -1237,16 +1239,25 @@ static int register_migration_monitor(void)
* Timerlat thread migration check is only required when running timerlat in user-space.
* Thus, enable callback only if timerlat is set with no workload.
*/
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (WARN_ON_ONCE(monitor_enabled))
+ return 0;
+
ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!ret)
+ monitor_enabled = true;
+ }
return ret;
}
static void unregister_migration_monitor(void)
{
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
- unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!monitor_enabled)
+ return;
+
+ unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ monitor_enabled = false;
}
#else
static int register_migration_monitor(void)
@@ -1444,9 +1455,9 @@ static int run_osnoise(void)
save_osn_sample_stats(osn_var, &s);
/*
- * if threshold is 0, use the default value of 5 us.
+ * if threshold is 0, use the default value of 1 us.
*/
- threshold = tracing_thresh ? : 5000;
+ threshold = tracing_thresh ? : 1000;
/*
* Apply PREEMPT and IRQ disabled options.
@@ -1535,7 +1546,7 @@ static int run_osnoise(void)
* This will eventually cause unwarranted noise as PREEMPT_RCU
* will force preemption as the means of ending the current
* grace period. We avoid this problem by calling
- * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * rcu_momentary_eqs(), which performs a zero duration
* EQS allowing PREEMPT_RCU to end the current grace period.
* This call shouldn't be wrapped inside an RCU critical
* section.
@@ -1547,7 +1558,7 @@ static int run_osnoise(void)
if (!disable_irq)
local_irq_disable();
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
if (!disable_irq)
local_irq_enable();
@@ -1612,6 +1623,7 @@ out:
static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
+static struct cpumask kthread_cpumask;
/*
* osnoise_sleep - sleep until the next period
@@ -1675,6 +1687,7 @@ static inline int osnoise_migration_pending(void)
*/
mutex_lock(&interface_lock);
this_cpu_osn_var()->kthread = NULL;
+ cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask);
mutex_unlock(&interface_lock);
return 1;
@@ -1945,11 +1958,12 @@ static void stop_kthread(unsigned int cpu)
{
struct task_struct *kthread;
- kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+ kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
if (kthread) {
- if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
+ !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
kthread_stop(kthread);
- } else {
+ } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) {
/*
* This is a user thread waiting on the timerlat_fd. We need
* to close all users, and the best way to guarantee this is
@@ -1958,7 +1972,6 @@ static void stop_kthread(unsigned int cpu)
kill_pid(kthread->thread_pid, SIGKILL, 1);
put_task_struct(kthread);
}
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
} else {
/* if no workload, just return */
if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
@@ -1967,7 +1980,6 @@ static void stop_kthread(unsigned int cpu)
*/
per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
barrier();
- return;
}
}
}
@@ -1999,6 +2011,10 @@ static int start_kthread(unsigned int cpu)
void *main = osnoise_main;
char comm[24];
+ /* Do not start a new thread if it is already running */
+ if (per_cpu(per_cpu_osnoise_var, cpu).kthread)
+ return 0;
+
if (timerlat_enabled()) {
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
@@ -2021,6 +2037,7 @@ static int start_kthread(unsigned int cpu)
}
per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+ cpumask_set_cpu(cpu, &kthread_cpumask);
return 0;
}
@@ -2048,8 +2065,15 @@ static int start_per_cpu_kthreads(void)
*/
cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
- for_each_possible_cpu(cpu)
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) {
+ struct task_struct *kthread;
+
+ kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
+ if (!WARN_ON(!kthread))
+ kthread_stop(kthread);
+ }
+ }
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
@@ -2070,24 +2094,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
unsigned int cpu = smp_processor_id();
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (!osnoise_has_registered_instances())
- goto out_unlock_trace;
+ return;
- mutex_lock(&interface_lock);
- cpus_read_lock();
+ guard(mutex)(&interface_lock);
+ guard(cpus_read_lock)();
+
+ if (!cpu_online(cpu))
+ return;
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
- goto out_unlock;
+ return;
start_kthread(cpu);
-
-out_unlock:
- cpus_read_unlock();
- mutex_unlock(&interface_lock);
-out_unlock_trace:
- mutex_unlock(&trace_types_lock);
}
static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
@@ -2285,31 +2306,22 @@ static ssize_t
osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
loff_t *ppos)
{
- char *mask_str;
+ char *mask_str __free(kfree) = NULL;
int len;
- mutex_lock(&interface_lock);
+ guard(mutex)(&interface_lock);
len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
mask_str = kmalloc(len, GFP_KERNEL);
- if (!mask_str) {
- count = -ENOMEM;
- goto out_unlock;
- }
+ if (!mask_str)
+ return -ENOMEM;
len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
- if (len >= count) {
- count = -EINVAL;
- goto out_free;
- }
+ if (len >= count)
+ return -EINVAL;
count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
-out_free:
- kfree(mask_str);
-out_unlock:
- mutex_unlock(&interface_lock);
-
return count;
}
@@ -2579,7 +2591,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file)
osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
- hrtimer_cancel(&tlat_var->timer);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
osn_var->sampling = 0;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..03d56f711ad1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep);
void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
{
+ struct trace_seq *s = &iter->seq;
va_list ap;
+ if (ignore_event(iter))
+ return;
+
va_start(ap, fmt);
- trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
+ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
va_end(ap);
}
EXPORT_SYMBOL(trace_event_printf);
@@ -460,20 +464,31 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
bh_off ? 'b' :
- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY |
TRACE_FLAG_PREEMPT_RESCHED)) {
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'B';
+ break;
case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'N';
break;
+ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'L';
+ break;
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY:
+ need_resched = 'b';
+ break;
case TRACE_FLAG_NEED_RESCHED:
need_resched = 'n';
break;
case TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'p';
break;
+ case TRACE_FLAG_NEED_RESCHED_LAZY:
+ need_resched = 'l';
+ break;
default:
need_resched = '.';
break;
@@ -990,8 +1005,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, int flags)
+ unsigned long parent_ip, long delta, int flags)
{
+ ip += delta;
+ parent_ip += delta;
+
seq_print_ip_sym(s, ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
@@ -1009,7 +1027,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1230,6 +1248,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
+ long delta = iter->tr->text_delta;
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1242,7 +1261,11 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
trace_seq_puts(s, " => ");
- seq_print_ip_sym(s, *p, flags);
+ if ((*p) == FTRACE_TRAMPOLINE_MARKER) {
+ trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n");
+ continue;
+ }
+ seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
@@ -1587,10 +1610,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long ip;
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
+ ip = field->ip + iter->tr->text_delta;
+
+ seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
@@ -1674,7 +1700,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index e37446f7916e..0c42b15c3800 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
+#include <linux/hardirq.h>
#include "trace.h"
#define CREATE_TRACE_POINTS
@@ -20,13 +21,29 @@
* tooling: these calls will only happen with RCU enabled, which can
* use a regular tracepoint.
*
- * On older architectures, use the rcuidle tracing methods (which
- * aren't NMI-safe - so exclude NMI contexts):
+ * On older architectures, RCU may not be watching in idle. In that
+ * case, wake up RCU to watch while calling the tracepoint. These
+ * aren't NMI-safe - so exclude NMI contexts:
*/
#ifdef CONFIG_ARCH_WANTS_NO_INSTR
-#define trace(point) trace_##point
+#define trace(point, args) trace_##point(args)
#else
-#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
+#define trace(point, args) \
+ do { \
+ if (trace_##point##_enabled()) { \
+ bool exit_rcu = false; \
+ if (in_nmi()) \
+ break; \
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && \
+ is_idle_task(current)) { \
+ ct_irq_enter(); \
+ exit_rcu = true; \
+ } \
+ trace_##point(args); \
+ if (exit_rcu) \
+ ct_irq_exit(); \
+ } \
+ } while (0)
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -42,7 +59,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -53,7 +70,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -75,7 +92,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
}
}
@@ -89,7 +106,7 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
@@ -100,13 +117,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- trace(preempt_enable)(a0, a1);
+ trace(preempt_enable, TP_ARGS(a0, a1));
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- trace(preempt_disable)(a0, a1);
+ trace(preempt_disable, TP_ARGS(a0, a1));
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 34289f9c6707..8f58ee1e8858 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "trace_probe: " fmt
#include <linux/bpf.h>
+#include <linux/fs.h>
#include "trace_btf.h"
#include "trace_probe.h"
@@ -275,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
}
trace_probe_log_err(offset, NO_EVENT_NAME);
return -EINVAL;
- } else if (len > MAX_EVENT_NAME_LEN) {
+ } else if (len >= MAX_EVENT_NAME_LEN) {
trace_probe_log_err(offset, EVENT_TOO_LONG);
return -EINVAL;
}
@@ -553,6 +554,10 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, fieldname,
&anon_offs);
+ if (IS_ERR(field)) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return PTR_ERR(field);
+ }
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
@@ -594,6 +599,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return 0;
}
+static int __store_entry_arg(struct trace_probe *tp, int argnum);
+
static int parse_btf_arg(char *varname,
struct fetch_insn **pcode, struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
@@ -618,11 +625,7 @@ static int parse_btf_arg(char *varname,
return -EOPNOTSUPP;
}
- if (ctx->flags & TPARG_FL_RETURN) {
- if (strcmp(varname, "$retval") != 0) {
- trace_probe_log_err(ctx->offset, NO_BTFARG);
- return -ENOENT;
- }
+ if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
/* Check whether the function return type is not void */
if (query_btf_context(ctx) == 0) {
@@ -654,11 +657,21 @@ static int parse_btf_arg(char *varname,
const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
if (name && !strcmp(name, varname)) {
- code->op = FETCH_OP_ARG;
- if (ctx->flags & TPARG_FL_TPOINT)
- code->param = i + 1;
- else
- code->param = i;
+ if (tparg_is_function_entry(ctx->flags)) {
+ code->op = FETCH_OP_ARG;
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param = i + 1;
+ else
+ code->param = i;
+ } else if (tparg_is_function_return(ctx->flags)) {
+ code->op = FETCH_OP_EDATA;
+ ret = __store_entry_arg(ctx->tp, i);
+ if (ret < 0) {
+ /* internal error */
+ return ret;
+ }
+ code->offset = ret;
+ }
tid = params[i].type;
goto found;
}
@@ -755,6 +768,110 @@ static int check_prepare_btf_string_fetch(char *typename,
#endif
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+
+static int __store_entry_arg(struct trace_probe *tp, int argnum)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ bool match = false;
+ int i, offset;
+
+ if (!earg) {
+ earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL);
+ if (!earg)
+ return -ENOMEM;
+ earg->size = 2 * tp->nr_args + 1;
+ earg->code = kcalloc(earg->size, sizeof(struct fetch_insn),
+ GFP_KERNEL);
+ if (!earg->code) {
+ kfree(earg);
+ return -ENOMEM;
+ }
+ /* Fill the code buffer with 'end' to simplify it */
+ for (i = 0; i < earg->size; i++)
+ earg->code[i].op = FETCH_OP_END;
+ tp->entry_arg = earg;
+ }
+
+ offset = 0;
+ for (i = 0; i < earg->size - 1; i++) {
+ switch (earg->code[i].op) {
+ case FETCH_OP_END:
+ earg->code[i].op = FETCH_OP_ARG;
+ earg->code[i].param = argnum;
+ earg->code[i + 1].op = FETCH_OP_ST_EDATA;
+ earg->code[i + 1].offset = offset;
+ return offset;
+ case FETCH_OP_ARG:
+ match = (earg->code[i].param == argnum);
+ break;
+ case FETCH_OP_ST_EDATA:
+ offset = earg->code[i].offset;
+ if (match)
+ return offset;
+ offset += sizeof(unsigned long);
+ break;
+ default:
+ break;
+ }
+ }
+ return -ENOSPC;
+}
+
+int traceprobe_get_entry_data_size(struct trace_probe *tp)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ int i, size = 0;
+
+ if (!earg)
+ return 0;
+
+ for (i = 0; i < earg->size; i++) {
+ switch (earg->code[i].op) {
+ case FETCH_OP_END:
+ goto out;
+ case FETCH_OP_ST_EDATA:
+ size = earg->code[i].offset + sizeof(unsigned long);
+ break;
+ default:
+ break;
+ }
+ }
+out:
+ return size;
+}
+
+void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val = 0;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = regs_get_kernel_argument(regs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+NOKPROBE_SYMBOL(store_trace_entry_data)
+#endif
+
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
/* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */
@@ -830,7 +947,7 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
len = str_has_prefix(arg, "arg");
- if (len && tparg_is_function_entry(ctx->flags)) {
+ if (len) {
ret = kstrtoul(arg + len, 10, &param);
if (ret)
goto inval;
@@ -839,15 +956,29 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
err = TP_ERR_BAD_ARG_NUM;
goto inval;
}
+ param--; /* argN starts from 1, but internal arg[N] starts from 0 */
- code->op = FETCH_OP_ARG;
- code->param = (unsigned int)param - 1;
- /*
- * The tracepoint probe will probe a stub function, and the
- * first parameter of the stub is a dummy and should be ignored.
- */
- if (ctx->flags & TPARG_FL_TPOINT)
- code->param++;
+ if (tparg_is_function_entry(ctx->flags)) {
+ code->op = FETCH_OP_ARG;
+ code->param = (unsigned int)param;
+ /*
+ * The tracepoint probe will probe a stub function, and the
+ * first parameter of the stub is a dummy and should be ignored.
+ */
+ if (ctx->flags & TPARG_FL_TPOINT)
+ code->param++;
+ } else if (tparg_is_function_return(ctx->flags)) {
+ /* function entry argument access from return probe */
+ ret = __store_entry_arg(ctx->tp, param);
+ if (ret < 0) /* This error should be an internal error */
+ return ret;
+
+ code->op = FETCH_OP_EDATA;
+ code->offset = ret;
+ } else {
+ err = TP_ERR_NOFENTRY_ARGS;
+ goto inval;
+ }
return 0;
}
#endif
@@ -1037,7 +1168,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
break;
default:
if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
- if (!tparg_is_function_entry(ctx->flags)) {
+ if (!tparg_is_function_entry(ctx->flags) &&
+ !tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
return -EINVAL;
}
@@ -1053,8 +1185,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
return ret;
}
-#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
-
/* Bitfield type needs to be parsed into a fetch function */
static int __parse_bitfield_probe_arg(const char *bf,
const struct fetch_type *t,
@@ -1090,67 +1220,45 @@ static int __parse_bitfield_probe_arg(const char *bf,
return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
}
-/* String length checking wrapper */
-static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
- struct probe_arg *parg,
- struct traceprobe_parse_context *ctx)
+/* Split type part from @arg and return it. */
+static char *parse_probe_arg_type(char *arg, struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
{
- struct fetch_insn *code, *scode, *tmp = NULL;
- char *t, *t2, *t3;
- int ret, len;
- char *arg;
+ char *t = NULL, *t2, *t3;
+ int offs;
- arg = kstrdup(argv, GFP_KERNEL);
- if (!arg)
- return -ENOMEM;
-
- ret = -EINVAL;
- len = strlen(arg);
- if (len > MAX_ARGSTR_LEN) {
- trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
- goto out;
- } else if (len == 0) {
- trace_probe_log_err(ctx->offset, NO_ARG_BODY);
- goto out;
- }
-
- ret = -ENOMEM;
- parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm)
- goto out;
-
- ret = -EINVAL;
t = strchr(arg, ':');
if (t) {
- *t = '\0';
- t2 = strchr(++t, '[');
+ *t++ = '\0';
+ t2 = strchr(t, '[');
if (t2) {
*t2++ = '\0';
t3 = strchr(t2, ']');
if (!t3) {
- int offs = t2 + strlen(t2) - arg;
+ offs = t2 + strlen(t2) - arg;
trace_probe_log_err(ctx->offset + offs,
ARRAY_NO_CLOSE);
- goto out;
+ return ERR_PTR(-EINVAL);
} else if (t3[1] != '\0') {
trace_probe_log_err(ctx->offset + t3 + 1 - arg,
BAD_ARRAY_SUFFIX);
- goto out;
+ return ERR_PTR(-EINVAL);
}
*t3 = '\0';
if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
trace_probe_log_err(ctx->offset + t2 - arg,
BAD_ARRAY_NUM);
- goto out;
+ return ERR_PTR(-EINVAL);
}
if (parg->count > MAX_ARRAY_LEN) {
trace_probe_log_err(ctx->offset + t2 - arg,
ARRAY_TOO_BIG);
- goto out;
+ return ERR_PTR(-EINVAL);
}
}
}
+ offs = t ? t - arg : 0;
/*
* Since $comm and immediate string can not be dereferenced,
@@ -1161,74 +1269,52 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
strncmp(arg, "\\\"", 2) == 0)) {
/* The type of $comm must be "string", and not an array type. */
if (parg->count || (t && strcmp(t, "string"))) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
- NEED_STRING_TYPE);
- goto out;
+ trace_probe_log_err(ctx->offset + offs, NEED_STRING_TYPE);
+ return ERR_PTR(-EINVAL);
}
parg->type = find_fetch_type("string", ctx->flags);
} else
parg->type = find_fetch_type(t, ctx->flags);
+
if (!parg->type) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE);
- goto out;
+ trace_probe_log_err(ctx->offset + offs, BAD_TYPE);
+ return ERR_PTR(-EINVAL);
}
- code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code)
- goto out;
- code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
-
- ctx->last_type = NULL;
- ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
- ctx);
- if (ret)
- goto fail;
-
- /* Update storing type if BTF is available */
- if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
- ctx->last_type) {
- if (!t) {
- parg->type = find_fetch_type_from_btf_type(ctx);
- } else if (strstr(t, "string")) {
- ret = check_prepare_btf_string_fetch(t, &code, ctx);
- if (ret)
- goto fail;
- }
- }
- parg->offset = *size;
- *size += parg->type->size * (parg->count ?: 1);
+ return t;
+}
- if (parg->count) {
- len = strlen(parg->type->fmttype) + 6;
- parg->fmt = kmalloc(len, GFP_KERNEL);
- if (!parg->fmt) {
- ret = -ENOMEM;
- goto out;
- }
- snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
- parg->count);
- }
+/* After parsing, adjust the fetch_insn according to the probe_arg */
+static int finalize_fetch_insn(struct fetch_insn *code,
+ struct probe_arg *parg,
+ char *type,
+ int type_offset,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *scode;
+ int ret;
- ret = -EINVAL;
/* Store operation */
if (parg->type->is_string) {
+ /* Check bad combination of the type and the last fetch_insn. */
if (!strcmp(parg->type->name, "symstr")) {
if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK &&
code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG &&
code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + type_offset,
BAD_SYMSTRING);
- goto fail;
+ return -EINVAL;
}
} else {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF &&
code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM &&
code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
+ trace_probe_log_err(ctx->offset + type_offset,
BAD_STRING);
- goto fail;
+ return -EINVAL;
}
}
+
if (!strcmp(parg->type->name, "symstr") ||
(code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG ||
@@ -1244,9 +1330,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -EINVAL;
}
}
+
/* If op == DEREF, replace it with STRING */
if (!strcmp(parg->type->name, "ustring") ||
code->op == FETCH_OP_UDEREF)
@@ -1267,47 +1354,128 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -E2BIG;
}
code->op = FETCH_OP_ST_RAW;
code->size = parg->type->size;
}
+
+ /* Save storing fetch_insn. */
scode = code;
+
/* Modify operation */
- if (t != NULL) {
- ret = __parse_bitfield_probe_arg(t, parg->type, &code);
+ if (type != NULL) {
+ /* Bitfield needs a special fetch_insn. */
+ ret = __parse_bitfield_probe_arg(type, parg->type, &code);
if (ret) {
- trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD);
- goto fail;
+ trace_probe_log_err(ctx->offset + type_offset, BAD_BITFIELD);
+ return ret;
}
} else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
ctx->last_type) {
+ /* If user not specified the type, try parsing BTF bitfield. */
ret = parse_btf_bitfield(&code, ctx);
if (ret)
- goto fail;
+ return ret;
}
- ret = -EINVAL;
+
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
scode->op != FETCH_OP_ST_STRING &&
scode->op != FETCH_OP_ST_USTRING) {
- trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0),
- BAD_STRING);
- goto fail;
+ trace_probe_log_err(ctx->offset + type_offset, BAD_STRING);
+ return -EINVAL;
}
code++;
if (code->op != FETCH_OP_NOP) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- goto fail;
+ return -E2BIG;
}
code->op = FETCH_OP_LP_ARRAY;
code->param = parg->count;
}
+
+ /* Finalize the fetch_insn array. */
code++;
code->op = FETCH_OP_END;
- ret = 0;
+ return 0;
+}
+
+/* String length checking wrapper */
+static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
+ struct probe_arg *parg,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code, *tmp = NULL;
+ char *type, *arg __free(kfree) = NULL;
+ int ret, len;
+
+ len = strlen(argv);
+ if (len > MAX_ARGSTR_LEN) {
+ trace_probe_log_err(ctx->offset, ARG_TOO_LONG);
+ return -E2BIG;
+ } else if (len == 0) {
+ trace_probe_log_err(ctx->offset, NO_ARG_BODY);
+ return -EINVAL;
+ }
+
+ arg = kstrdup(argv, GFP_KERNEL);
+ if (!arg)
+ return -ENOMEM;
+
+ parg->comm = kstrdup(arg, GFP_KERNEL);
+ if (!parg->comm)
+ return -ENOMEM;
+
+ type = parse_probe_arg_type(arg, parg, ctx);
+ if (IS_ERR(type))
+ return PTR_ERR(type);
+
+ code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
+ if (!code)
+ return -ENOMEM;
+ code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
+
+ ctx->last_type = NULL;
+ ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
+ ctx);
+ if (ret < 0)
+ goto fail;
+
+ /* Update storing type if BTF is available */
+ if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) &&
+ ctx->last_type) {
+ if (!type) {
+ parg->type = find_fetch_type_from_btf_type(ctx);
+ } else if (strstr(type, "string")) {
+ ret = check_prepare_btf_string_fetch(type, &code, ctx);
+ if (ret)
+ goto fail;
+ }
+ }
+ parg->offset = *size;
+ *size += parg->type->size * (parg->count ?: 1);
+
+ if (parg->count) {
+ len = strlen(parg->type->fmttype) + 6;
+ parg->fmt = kmalloc(len, GFP_KERNEL);
+ if (!parg->fmt) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
+ parg->count);
+ }
+
+ ret = finalize_fetch_insn(code, parg, type, type ? type - arg : 0, ctx);
+ if (ret < 0)
+ goto fail;
+
+ for (; code < tmp + FETCH_INSN_MAX; code++)
+ if (code->op == FETCH_OP_END)
+ break;
/* Shrink down the code buffer */
parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
@@ -1316,15 +1484,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1));
fail:
- if (ret) {
+ if (ret < 0) {
for (code = tmp; code < tmp + FETCH_INSN_MAX; code++)
if (code->op == FETCH_NOP_SYMBOL ||
code->op == FETCH_OP_DATA)
kfree(code->data);
}
kfree(tmp);
-out:
- kfree(arg);
return ret;
}
@@ -1379,9 +1545,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg,
struct probe_arg *parg = &tp->args[i];
const char *body;
- /* Increment count for freeing args in error case */
- tp->nr_args++;
-
+ ctx->tp = tp;
body = strchr(arg, '=');
if (body) {
if (body - arg > MAX_ARG_NAME_LEN) {
@@ -1438,7 +1602,8 @@ static int argv_has_var_arg(int argc, const char *argv[], int *args_idx,
if (str_has_prefix(argv[i], "$arg")) {
trace_probe_log_set_index(i + 2);
- if (!tparg_is_function_entry(ctx->flags)) {
+ if (!tparg_is_function_entry(ctx->flags) &&
+ !tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(0, NOFENTRY_ARGS);
return -EINVAL;
}
@@ -1495,7 +1660,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
{
const struct btf_param *params = NULL;
int i, j, n, used, ret, args_idx = -1;
- const char **new_argv = NULL;
+ const char **new_argv __free(kfree) = NULL;
ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
if (ret < 0)
@@ -1534,7 +1699,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
ret = sprint_nth_btf_arg(n, "", buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
@@ -1548,25 +1713,78 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
n = simple_strtoul(argv[i] + 4, &type, 10);
if (type && !(*type == ':' || *type == '\0')) {
trace_probe_log_err(0, BAD_VAR);
- ret = -ENOENT;
- goto error;
+ return ERR_PTR(-ENOENT);
}
/* Note: $argN starts from $arg1 */
ret = sprint_nth_btf_arg(n - 1, type, buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
} else
new_argv[j++] = argv[i];
}
- return new_argv;
+ return_ptr(new_argv);
+}
-error:
- kfree(new_argv);
- return ERR_PTR(ret);
+/* @buf: *buf must be equal to NULL. Caller must to free *buf */
+int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
+{
+ int i, used, ret;
+ const int bufsize = MAX_DENTRY_ARGS_LEN;
+ char *tmpbuf __free(kfree) = NULL;
+
+ if (*buf)
+ return -EINVAL;
+
+ used = 0;
+ for (i = 0; i < argc; i++) {
+ char *tmp __free(kfree) = NULL;
+ char *equal;
+ size_t arg_len;
+
+ if (!glob_match("*:%p[dD]", argv[i]))
+ continue;
+
+ if (!tmpbuf) {
+ tmpbuf = kmalloc(bufsize, GFP_KERNEL);
+ if (!tmpbuf)
+ return -ENOMEM;
+ }
+
+ tmp = kstrdup(argv[i], GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ equal = strchr(tmp, '=');
+ if (equal)
+ *equal = '\0';
+ arg_len = strlen(argv[i]);
+ tmp[arg_len - 4] = '\0';
+ if (argv[i][arg_len - 1] == 'd')
+ ret = snprintf(tmpbuf + used, bufsize - used,
+ "%s%s+0x0(+0x%zx(%s)):string",
+ equal ? tmp : "", equal ? "=" : "",
+ offsetof(struct dentry, d_name.name),
+ equal ? equal + 1 : tmp);
+ else
+ ret = snprintf(tmpbuf + used, bufsize - used,
+ "%s%s+0x0(+0x%zx(+0x%zx(%s))):string",
+ equal ? tmp : "", equal ? "=" : "",
+ offsetof(struct dentry, d_name.name),
+ offsetof(struct file, f_path.dentry),
+ equal ? equal + 1 : tmp);
+
+ if (ret >= bufsize - used)
+ return -ENOMEM;
+ argv[i] = tmpbuf + used;
+ used += ret + 1;
+ }
+
+ *buf = no_free_ptr(tmpbuf);
+ return 0;
}
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
@@ -1761,12 +1979,18 @@ void trace_probe_cleanup(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
traceprobe_free_probe_arg(&tp->args[i]);
+ if (tp->entry_arg) {
+ kfree(tp->entry_arg->code);
+ kfree(tp->entry_arg);
+ tp->entry_arg = NULL;
+ }
+
if (tp->event)
trace_probe_unlink(tp);
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group, bool alloc_filter)
+ const char *group, bool alloc_filter, int nargs)
{
struct trace_event_call *call;
size_t size = sizeof(struct trace_probe_event);
@@ -1802,6 +2026,11 @@ int trace_probe_init(struct trace_probe *tp, const char *event,
goto error;
}
+ tp->nr_args = nargs;
+ /* Make sure pointers in args[] are NULL */
+ if (nargs)
+ memset(tp->args, 0, sizeof(tp->args[0]) * nargs);
+
return 0;
error:
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index c1877d018269..96792bc4b092 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -34,8 +34,8 @@
#define MAX_ARRAY_LEN 64
#define MAX_ARG_NAME_LEN 32
#define MAX_BTF_ARGS_LEN 128
+#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
-#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -92,6 +92,7 @@ enum fetch_op {
FETCH_OP_ARG, /* Function argument : .param */
FETCH_OP_FOFFS, /* File offset: .immediate */
FETCH_OP_DATA, /* Allocated data: .data */
+ FETCH_OP_EDATA, /* Entry data: .offset */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
@@ -102,6 +103,7 @@ enum fetch_op {
FETCH_OP_ST_STRING, /* String: .offset, .size */
FETCH_OP_ST_USTRING, /* User String: .offset, .size */
FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */
+ FETCH_OP_ST_EDATA, /* Store Entry Data: .offset */
// Stage 4 (modify) op
FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */
// Stage 5 (loop) op
@@ -232,6 +234,11 @@ struct probe_arg {
const struct fetch_type *type; /* Type of this argument */
};
+struct probe_entry_arg {
+ struct fetch_insn *code;
+ unsigned int size; /* The entry data size */
+};
+
struct trace_uprobe_filter {
rwlock_t rwlock;
int nr_systemwide;
@@ -253,6 +260,7 @@ struct trace_probe {
struct trace_probe_event *event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
+ struct probe_entry_arg *entry_arg; /* This is only for return probe */
struct probe_arg args[];
};
@@ -338,7 +346,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp)
}
int trace_probe_init(struct trace_probe *tp, const char *event,
- const char *group, bool alloc_filter);
+ const char *group, bool alloc_filter, int nargs);
void trace_probe_cleanup(struct trace_probe *tp);
int trace_probe_append(struct trace_probe *tp, struct trace_probe *to);
void trace_probe_unlink(struct trace_probe *tp);
@@ -355,6 +363,18 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char
int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
u8 *data, void *field);
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+int traceprobe_get_entry_data_size(struct trace_probe *tp);
+/* This is a runtime function to store entry data */
+void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs);
+#else /* !CONFIG_HAVE_FUNCTION_ARG_ACCESS_API */
+static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
+{
+ return 0;
+}
+#define store_trace_entry_data(edata, tp, regs) do { } while (0)
+#endif
+
#define trace_probe_for_each_link(pos, tp) \
list_for_each_entry(pos, &(tp)->event->files, list)
#define trace_probe_for_each_link_rcu(pos, tp) \
@@ -381,6 +401,11 @@ static inline bool tparg_is_function_entry(unsigned int flags)
return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY);
}
+static inline bool tparg_is_function_return(unsigned int flags)
+{
+ return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN);
+}
+
struct traceprobe_parse_context {
struct trace_event_call *event;
/* BTF related parameters */
@@ -392,6 +417,7 @@ struct traceprobe_parse_context {
const struct btf_type *last_type; /* Saved type */
u32 last_bitoffs; /* Saved bitoffs */
u32 last_bitsize; /* Saved bitsize */
+ struct trace_probe *tp;
unsigned int flags;
int offset;
};
@@ -402,6 +428,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char **traceprobe_expand_meta_args(int argc, const char *argv[],
int *new_argc, char *buf, int bufsize,
struct traceprobe_parse_context *ctx);
+extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
@@ -453,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \
C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
C(NO_TRACEPOINT, "Tracepoint is not found"), \
+ C(BAD_TP_NAME, "Invalid character in tracepoint name"),\
C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \
C(NO_GROUP_NAME, "Group name is not specified"), \
C(GROUP_TOO_LONG, "Group name is too long"), \
@@ -506,7 +534,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_BTFARG, "This variable is not found at this probe point"),\
C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \
C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\
- C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \
+ C(NOFENTRY_ARGS, "$arg* can be used only on function entry or exit"), \
C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \
C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \
C(ARGIDX_2BIG, "$argN index is too big"), \
@@ -516,7 +544,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_BTF_FIELD, "This field is not found."), \
C(BAD_BTF_TID, "Failed to get BTF type info."),\
C(BAD_TYPE4STR, "This type does not fit for string."),\
- C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
+ C(TOO_MANY_EARGS, "Too many entry arguments specified"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 3935b347f874..f39b37fcdb3b 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf)
* If dest is NULL, don't store result and return required dynamic data size.
*/
static int
-process_fetch_insn(struct fetch_insn *code, void *rec,
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
void *dest, void *base);
static nokprobe_inline int fetch_store_strlen(unsigned long addr);
static nokprobe_inline int
@@ -232,7 +232,7 @@ array:
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
+__get_data_size(struct trace_probe *tp, void *regs, void *edata)
{
struct probe_arg *arg;
int i, len, ret = 0;
@@ -240,7 +240,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
for (i = 0; i < tp->nr_args; i++) {
arg = tp->args + i;
if (unlikely(arg->dynamic)) {
- len = process_fetch_insn(arg->code, regs, NULL, NULL);
+ len = process_fetch_insn(arg->code, regs, edata, NULL, NULL);
if (len > 0)
ret += len;
}
@@ -251,7 +251,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
/* Store the value of each argument */
static nokprobe_inline void
-store_trace_args(void *data, struct trace_probe *tp, void *rec,
+store_trace_args(void *data, struct trace_probe *tp, void *rec, void *edata,
int header_size, int maxlen)
{
struct probe_arg *arg;
@@ -266,7 +266,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,
/* Point the dynamic data area if needed */
if (unlikely(arg->dynamic))
*dl = make_data_loc(maxlen, dyndata - base);
- ret = process_fetch_insn(arg->code, rec, dl, base);
+ ret = process_fetch_insn(arg->code, rec, edata, dl, base);
if (arg->dynamic && likely(ret > 0)) {
dyndata += ret;
maxlen -= ret;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c9ffdcfe622e..cb49f7279dc8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
+#include <linux/kmemleak.h>
#include <linux/ftrace.h>
#include <trace/events/sched.h>
@@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void)
{
tracing_stop_sched_switch(RECORD_TGID);
}
+
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
+static int *tgid_map;
+
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
+#define SAVED_CMDLINES_DEFAULT 128
+#define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
+static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+struct saved_cmdlines_buffer {
+ unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+ unsigned *map_cmdline_to_pid;
+ unsigned cmdline_num;
+ int cmdline_idx;
+ char saved_cmdlines[];
+};
+static struct saved_cmdlines_buffer *savedcmd;
+
+/* Holds the size of a cmdline and pid element */
+#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \
+ (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0]))
+
+static inline char *get_saved_cmdlines(int idx)
+{
+ return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
+}
+
+static inline void set_cmdline(int idx, const char *cmdline)
+{
+ strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+}
+
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
+{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kmemleak_free(s);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s);
+ s->cmdline_num = val;
+
+ /* Place map_cmdline_to_pid array right after saved_cmdlines */
+ s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN];
+
+ s->cmdline_idx = 0;
+ memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
+ sizeof(s->map_pid_to_cmdline));
+ memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_pid));
+
+ return s;
+}
+
+int trace_create_savedcmd(void)
+{
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
+
+ return savedcmd ? 0 : -ENOMEM;
+}
+
+int trace_save_cmdline(struct task_struct *tsk)
+{
+ unsigned tpid, idx;
+
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
+
+ /*
+ * It's not the end of the world if we don't get
+ * the lock, but we also don't want to spin
+ * nor do we want to disable interrupts,
+ * so if we miss here, then better luck next time.
+ *
+ * This is called within the scheduler and wake up, so interrupts
+ * had better been disabled and run queue lock been held.
+ */
+ lockdep_assert_preemption_disabled();
+ if (!arch_spin_trylock(&trace_cmdline_lock))
+ return 0;
+
+ idx = savedcmd->map_pid_to_cmdline[tpid];
+ if (idx == NO_CMDLINE_MAP) {
+ idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
+
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
+ savedcmd->cmdline_idx = idx;
+ }
+
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
+ set_cmdline(idx, tsk->comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
+}
+
+static void __trace_find_cmdline(int pid, char comm[])
+{
+ unsigned map;
+ int tpid;
+
+ if (!pid) {
+ strcpy(comm, "<idle>");
+ return;
+ }
+
+ if (WARN_ON_ONCE(pid < 0)) {
+ strcpy(comm, "<XXX>");
+ return;
+ }
+
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
+ }
+ strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release in set_tracer_flag() to ensure that
+ * if we observe a non-NULL tgid_map then we also observe the correct
+ * tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
+int trace_find_tgid(int pid)
+{
+ int *ptr = trace_find_tgid_ptr(pid);
+
+ return ptr ? *ptr : 0;
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
+{
+ int *ptr;
+
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
+ return 0;
+
+ *ptr = tsk->tgid;
+ return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+ if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+ return true;
+ if (!__this_cpu_read(trace_taskinfo_save))
+ return true;
+ return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task: task to record
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev: previous task during sched_switch
+ * @next: next task during sched_switch
+ * @flags: TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+ struct task_struct *next, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
+ done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record a specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_TGID);
+}
+
+int trace_alloc_tgid_map(void)
+{
+ int *map;
+
+ if (tgid_map)
+ return 0;
+
+ tgid_map_max = init_pid_ns.pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+ if (!map)
+ return -ENOMEM;
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ return 0;
+}
+
+static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int pid = ++(*pos);
+
+ return trace_find_tgid_ptr(pid);
+}
+
+static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
+{
+ int pid = *pos;
+
+ return trace_find_tgid_ptr(pid);
+}
+
+static void saved_tgids_stop(struct seq_file *m, void *v)
+{
+}
+
+static int saved_tgids_show(struct seq_file *m, void *v)
+{
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
+
+ if (tgid == 0)
+ return SEQ_SKIP;
+
+ seq_printf(m, "%d %d\n", pid, tgid);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_tgids_seq_ops = {
+ .start = saved_tgids_start,
+ .stop = saved_tgids_stop,
+ .next = saved_tgids_next,
+ .show = saved_tgids_show,
+};
+
+static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
+ return seq_open(filp, &tracing_saved_tgids_seq_ops);
+}
+
+
+const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_saved_tgids_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ unsigned int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
+ ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
+ continue;
+
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ v = &savedcmd->map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = tracing_check_open_get_tr(NULL);
+ if (ret)
+ return ret;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
+}
+
+const struct file_operations tracing_saved_cmdlines_fops = {
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static ssize_t
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ int r;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+void trace_free_saved_cmdlines_buffer(void)
+{
+ free_saved_cmdlines_buffer(savedcmd);
+}
+
+static int tracing_resize_saved_cmdlines(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s, *savedcmd_temp;
+
+ s = allocate_cmdlines_buffer(val);
+ if (!s)
+ return -ENOMEM;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+ savedcmd_temp = savedcmd;
+ savedcmd = s;
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ free_saved_cmdlines_buffer(savedcmd_temp);
+
+ return 0;
+}
+
+static ssize_t
+tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ /* must have at least 1 entry or less than PID_MAX_DEFAULT */
+ if (!val || val > PID_MAX_DEFAULT)
+ return -EINVAL;
+
+ ret = tracing_resize_saved_cmdlines((unsigned int)val);
+ if (ret < 0)
+ return ret;
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+const struct file_operations tracing_saved_cmdlines_size_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_cmdlines_size_read,
+ .write = tracing_saved_cmdlines_size_write,
+};
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0469a04a355f..af30586f1aea 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -112,14 +112,17 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
return start_func_tracer(tr, set);
}
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
int ret = 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -134,6 +137,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -141,18 +150,29 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+static void wakeup_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- __trace_graph_return(tr, trace, trace_ctx);
+ rettime = trace_clock_local();
+
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -376,7 +396,6 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *next,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_context_switch;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -394,8 +413,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void
@@ -404,7 +422,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *curr,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -422,8 +439,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void notrace
@@ -545,7 +561,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
- (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (wakeup_rt && !rt_or_dl_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 529590499b1f..d88c44f1dfa5 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_PRINT:
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
+ case TRACE_GRAPH_RETADDR_ENT:
case TRACE_GRAPH_RET:
return 1;
}
@@ -756,19 +757,284 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define CHAR_NUMBER 123
+#define SHORT_NUMBER 12345
+#define WORD_NUMBER 1234567890
+#define LONG_NUMBER 1234567890123456789LL
+#define ERRSTR_BUFLEN 128
+
+struct fgraph_fixture {
+ struct fgraph_ops gops;
+ int store_size;
+ const char *store_type_name;
+ char error_str_buf[ERRSTR_BUFLEN];
+ char *error_str;
+};
+
+static __init int store_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ int size = fixture->store_size;
+ void *p;
+
+ p = fgraph_reserve_data(gops->idx, size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to reserve %s\n", type);
+ return 0;
+ }
+
+ switch (size) {
+ case 1:
+ *(char *)p = CHAR_NUMBER;
+ break;
+ case 2:
+ *(short *)p = SHORT_NUMBER;
+ break;
+ case 4:
+ *(int *)p = WORD_NUMBER;
+ break;
+ case 8:
+ *(long long *)p = LONG_NUMBER;
+ break;
+ }
+
+ return 1;
+}
+
+static __init void store_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ long long expect = 0;
+ long long found = -1;
+ int size;
+ char *p;
+
+ p = fgraph_retrieve_data(gops->idx, &size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to retrieve %s\n", type);
+ return;
+ }
+ if (fixture->store_size > size) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Retrieved size %d is smaller than expected %d\n",
+ size, (int)fixture->store_size);
+ return;
+ }
+
+ switch (fixture->store_size) {
+ case 1:
+ expect = CHAR_NUMBER;
+ found = *(char *)p;
+ break;
+ case 2:
+ expect = SHORT_NUMBER;
+ found = *(short *)p;
+ break;
+ case 4:
+ expect = WORD_NUMBER;
+ found = *(int *)p;
+ break;
+ case 8:
+ expect = LONG_NUMBER;
+ found = *(long long *)p;
+ break;
+ }
+
+ if (found != expect) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "%s returned not %lld but %lld\n", type, expect, found);
+ return;
+ }
+ fixture->error_str = NULL;
+}
+
+static int __init init_fgraph_fixture(struct fgraph_fixture *fixture)
+{
+ char *func_name;
+ int len;
+
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to execute storage %s\n", fixture->store_type_name);
+ fixture->error_str = fixture->error_str_buf;
+
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ return ftrace_set_filter(&fixture->gops.ops, func_name, len, 1);
+}
+
+/* Test fgraph storage for each size */
+static int __init test_graph_storage_single(struct fgraph_fixture *fixture)
+{
+ int size = fixture->store_size;
+ int ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing fgraph storage of %d byte%s: ", size, str_plural(size));
+
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ return -1;
+ }
+
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ return -1;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str) {
+ pr_cont("*** %s ***", fixture->error_str);
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct fgraph_fixture store_bytes[4] __initdata = {
+ [0] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 1,
+ .store_type_name = "byte",
+ },
+ [1] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 2,
+ .store_type_name = "short",
+ },
+ [2] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 4,
+ .store_type_name = "word",
+ },
+ [3] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 8,
+ .store_type_name = "long long",
+ },
+};
+
+static __init int test_graph_storage_multi(void)
+{
+ struct fgraph_fixture *fixture;
+ bool printed = false;
+ int i, j, ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing multiple fgraph storage on a function: ");
+
+ for (i = 0; i < ARRAY_SIZE(store_bytes); i++) {
+ fixture = &store_bytes[i];
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ printed = true;
+ goto out2;
+ }
+ }
+
+ for (j = 0; j < ARRAY_SIZE(store_bytes); j++) {
+ fixture = &store_bytes[j];
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ printed = true;
+ goto out1;
+ }
+ }
+
+ DYN_FTRACE_TEST_NAME();
+out1:
+ while (--j >= 0) {
+ fixture = &store_bytes[j];
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+out2:
+ while (--i >= 0) {
+ fixture = &store_bytes[i];
+ ftrace_free_filter(&fixture->gops.ops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+ return printed ? -1 : 0;
+}
+
+/* Test the storage passed across function_graph entry and return */
+static __init int test_graph_storage(void)
+{
+ int ret;
+
+ ret = test_graph_storage_single(&store_bytes[0]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[1]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[2]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[3]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_multi();
+ if (ret)
+ return ret;
+ return 0;
+}
+#else
+static inline int test_graph_storage(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
-static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops) {
+ if (ftrace_dump_on_oops_enabled()) {
ftrace_dump(DUMP_ALL);
/* ftrace_dump() disables tracing */
tracing_on();
@@ -776,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
return 0;
}
- return trace_graph_entry(trace);
+ return trace_graph_entry(trace, gops, fregs);
}
static struct fgraph_ops fgraph_ops __initdata = {
@@ -812,7 +1078,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
warn_failed_init_tracer(trace, ret);
@@ -855,7 +1121,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
cond_resched();
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
/*
* Some archs *cough*PowerPC*cough* add characters to the
@@ -912,6 +1178,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
ftrace_set_global_filter(NULL, 0, 1);
#endif
+ ret = test_graph_storage();
+
/* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
@@ -1221,7 +1489,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* reset the max latency */
tr->max_latency = 0;
- while (p->on_rq) {
+ while (task_is_runnable(p)) {
/*
* Sleep to make sure the -deadline thread is asleep too.
* On virtual machines we can't rely on timings,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5a48dba912ea..14c6f272c4d8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -514,26 +514,24 @@ static const struct file_operations stack_trace_filter_fops = {
#endif /* CONFIG_DYNAMIC_FTRACE */
int
-stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int was_enabled;
int ret;
- mutex_lock(&stack_sysctl_mutex);
+ guard(mutex)(&stack_sysctl_mutex);
was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (was_enabled == !!stack_tracer_enabled))
- goto out;
+ return ret;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
- out:
- mutex_unlock(&stack_sysctl_mutex);
return ret;
}
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index bb247beec447..b3b5586f104d 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session)
int ret = 0;
int i;
- mutex_lock(&session->stat_mutex);
+ guard(mutex)(&session->stat_mutex);
__reset_stat_session(session);
if (!ts->stat_cmp)
@@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session)
stat = ts->stat_start(ts);
if (!stat)
- goto exit;
+ return 0;
ret = insert_stat(root, stat, ts->stat_cmp);
if (ret)
- goto exit;
+ return ret;
/*
* Iterate over the tracer stat entries and store them in an rbtree.
@@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session)
goto exit_free_rbtree;
}
-exit:
- mutex_unlock(&session->stat_mutex);
return ret;
exit_free_rbtree:
__reset_stat_session(session);
- mutex_unlock(&session->stat_mutex);
return ret;
}
@@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret = -EINVAL;
+ int ret;
if (!trace)
return -EINVAL;
@@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace)
if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
return -EINVAL;
+ guard(mutex)(&all_stat_sessions_mutex);
+
/* Already registered? */
- mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
if (node->ts == trace)
- goto out;
+ return -EINVAL;
}
- ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- goto out;
+ return -ENOMEM;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- goto out;
+ return ret;
}
- ret = 0;
/* Register */
list_add_tail(&session->session_list, &all_stat_sessions);
- out:
- mutex_unlock(&all_stat_sessions_mutex);
- return ret;
+ return 0;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c581d6da843..46aab0ab9350 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -299,6 +299,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int syscall_nr;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -338,6 +345,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_buffer fbuffer;
int syscall_nr;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -564,6 +578,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +590,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -582,6 +598,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -602,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -611,7 +634,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+ !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -666,6 +689,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
} __aligned(8) param;
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
@@ -676,12 +700,20 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -701,7 +733,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -709,7 +741,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->ret = syscall_get_return_value(current, regs);
if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+ !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a84b85d8aac1..ccc762fbb69c 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/rculist.h>
#include <linux/filter.h>
+#include <linux/percpu.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
@@ -58,11 +59,11 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
- struct inode *inode;
char *filename;
+ struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
- unsigned long nhit;
+ unsigned long __percpu *nhits;
struct trace_probe tp;
};
@@ -88,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
- unsigned long func, struct pt_regs *regs);
+ unsigned long func, struct pt_regs *regs,
+ __u64 *data);
#ifdef CONFIG_STACK_GROWSUP
static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
@@ -211,8 +214,8 @@ static unsigned long translate_user_vaddr(unsigned long file_offset)
/* Note that we don't verify it, since the code does not come from user space */
static int
-process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,
- void *base)
+process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
+ void *dest, void *base)
{
struct pt_regs *regs = rec;
unsigned long val;
@@ -337,7 +340,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
- ret = trace_probe_init(&tu->tp, event, group, true);
+ tu->nhits = alloc_percpu(unsigned long);
+ if (!tu->nhits) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ ret = trace_probe_init(&tu->tp, event, group, true, nargs);
if (ret < 0)
goto error;
@@ -349,6 +358,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
return tu;
error:
+ free_percpu(tu->nhits);
kfree(tu);
return ERR_PTR(ret);
@@ -362,6 +372,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
path_put(&tu->path);
trace_probe_cleanup(&tu->tp);
kfree(tu->filename);
+ free_percpu(tu->nhits);
kfree(tu);
}
@@ -487,11 +498,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
struct trace_uprobe *old_tu;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = validate_ref_ctr_offset(tu);
if (ret)
- goto end;
+ return ret;
/* register as an event */
old_tu = find_probe_event(trace_probe_name(&tu->tp),
@@ -500,11 +511,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_uprobe(tu, old_tu);
+ return -EEXIST;
}
- goto end;
+ return append_trace_uprobe(tu, old_tu);
}
ret = register_uprobe_event(tu);
@@ -514,14 +523,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
-end:
- mutex_unlock(&event_mutex);
-
return ret;
}
@@ -556,6 +562,8 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
+ if (argc - 2 > MAX_TRACE_ARGS)
+ return -E2BIG;
if (argv[0][1] == ':')
event = &argv[0][2];
@@ -681,7 +689,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
tu->filename = filename;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
struct traceprobe_parse_context ctx = {
.flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
};
@@ -815,13 +823,21 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct dyn_event *ev = v;
struct trace_uprobe *tu;
+ unsigned long nhits;
+ int cpu;
if (!is_trace_uprobe(ev))
return 0;
tu = to_trace_uprobe(ev);
+
+ nhits = 0;
+ for_each_possible_cpu(cpu) {
+ nhits += per_cpu(*tu->nhits, cpu);
+ }
+
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
- trace_probe_name(&tu->tp), tu->nhit);
+ trace_probe_name(&tu->tp), nhits);
return 0;
}
@@ -854,9 +870,11 @@ static const struct file_operations uprobe_profile_ops = {
struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
+ int dsize;
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
+#define MAX_UCB_BUFFER_SIZE PAGE_SIZE
static int uprobe_buffer_init(void)
{
@@ -940,12 +958,41 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
{
+ if (!ucb)
+ return;
mutex_unlock(&ucb->mutex);
}
+static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer **ucbp)
+{
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+
+ if (*ucbp)
+ return *ucbp;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ dsize = __get_data_size(&tu->tp, regs, NULL);
+
+ ucb = uprobe_buffer_get();
+ ucb->dsize = tu->tp.size + dsize;
+
+ if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
+ ucb->dsize = MAX_UCB_BUFFER_SIZE;
+ dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
+ }
+
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
+
+ *ucbp = ucb;
+ return ucb;
+}
+
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize,
+ struct uprobe_cpu_buffer *ucb,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
@@ -956,14 +1003,11 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
WARN_ON(call != trace_file->event_call);
- if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
- return;
-
if (trace_trigger_soft_disabled(trace_file))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ size = esize + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
@@ -977,23 +1021,26 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, ucb->dsize);
trace_event_buffer_commit(&fbuffer);
}
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
+ struct uprobe_cpu_buffer *ucb;
if (is_ret_probe(tu))
return 0;
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
+
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, 0, regs, ucb, link->file);
rcu_read_unlock();
return 0;
@@ -1001,13 +1048,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
+ struct uprobe_cpu_buffer *ucb;
+
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, func, regs, ucb, link->file);
rcu_read_unlock();
}
@@ -1047,43 +1097,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return trace_handle_return(s);
}
-typedef bool (*filter_func_t)(struct uprobe_consumer *self,
- enum uprobe_filter_ctx ctx,
- struct mm_struct *mm);
+typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
- int ret;
+ struct inode *inode = d_real_inode(tu->path.dentry);
+ struct uprobe *uprobe;
tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
+ uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
+ if (IS_ERR(uprobe))
+ return PTR_ERR(uprobe);
- if (tu->ref_ctr_offset)
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- else
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
-
- if (ret)
- tu->inode = NULL;
-
- return ret;
+ tu->uprobe = uprobe;
+ return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
+ bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- if (!tu->inode)
+ if (!tu->uprobe)
continue;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
+ sync = true;
+ tu->uprobe = NULL;
}
+ if (sync)
+ uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
@@ -1199,9 +1246,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
{
struct perf_event *event;
- if (filter->nr_systemwide)
- return true;
-
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
if (event->hw.target->mm == mm)
return true;
@@ -1282,7 +1326,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
@@ -1306,7 +1350,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
@@ -1316,8 +1360,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return err;
}
-static bool uprobe_perf_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
@@ -1326,6 +1369,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;
+ /*
+ * speculative short-circuiting check to avoid unnecessarily taking
+ * filter->rwlock below, if the uprobe has system-wide consumer
+ */
+ if (READ_ONCE(filter->nr_systemwide))
+ return true;
+
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
@@ -1335,10 +1385,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
+ struct uprobe_cpu_buffer *ucb;
struct hlist_head *head;
void *data;
int size, esize;
@@ -1346,9 +1397,13 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
#ifdef CONFIG_BPF_EVENTS
if (bpf_prog_array_valid(call)) {
+ const struct bpf_prog_array *array;
u32 ret;
- ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run);
+ rcu_read_lock_trace();
+ array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held());
+ ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run);
+ rcu_read_unlock_trace();
if (!ret)
return;
}
@@ -1356,7 +1411,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
+ size = esize + ucb->dsize;
size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
@@ -1379,13 +1435,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, ucb->dsize);
- if (size - esize > tu->tp.size + dsize) {
- int len = tu->tp.size + dsize;
-
- memset(data + len, 0, size - esize - len);
- }
+ if (size - esize > ucb->dsize)
+ memset(data + ucb->dsize, 0, size - esize - ucb->dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
@@ -1395,21 +1448,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
- if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ if (!uprobe_perf_filter(&tu->consumer, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
- __uprobe_perf_func(tu, 0, regs, ucb, dsize);
+ __uprobe_perf_func(tu, 0, regs, ucbp);
return 0;
}
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
- __uprobe_perf_func(tu, func, regs, ucb, dsize);
+ __uprobe_perf_func(tu, func, regs, ucbp);
}
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
@@ -1470,17 +1523,17 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
}
}
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
+ struct uprobe_cpu_buffer *ucb = NULL;
int ret = 0;
-
tu = container_of(con, struct trace_uprobe, consumer);
- tu->nhit++;
+
+ this_cpu_inc(*tu->nhits);
udd.tu = tu;
udd.bp_addr = instruction_pointer(regs);
@@ -1490,30 +1543,24 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- ret |= uprobe_trace_func(tu, regs, ucb, dsize);
+ ret |= uprobe_trace_func(tu, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- ret |= uprobe_perf_func(tu, regs, ucb, dsize);
+ ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return ret;
}
static int uretprobe_dispatcher(struct uprobe_consumer *con,
- unsigned long func, struct pt_regs *regs)
+ unsigned long func, struct pt_regs *regs,
+ __u64 *data)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
+ struct uprobe_cpu_buffer *ucb = NULL;
tu = container_of(con, struct trace_uprobe, consumer);
@@ -1525,18 +1572,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- uretprobe_trace_func(tu, func, regs, ucb, dsize);
+ uretprobe_trace_func(tu, func, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- uretprobe_perf_func(tu, func, regs, ucb, dsize);
+ uretprobe_perf_func(tu, func, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return 0;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f24352..1921ade45be3 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
struct tracing_map_elt *elt = NULL;
int idx;
- idx = atomic_inc_return(&map->next_elt);
+ idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
if (idx < map->max_elts) {
elt = *(TRACING_MAP_ELT(map->elts, idx));
if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
{
unsigned int i;
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
atomic64_set(&map->hits, 0);
atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
map->map_bits = map_bits;
map->max_elts = (1 << map_bits);
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
map->map_size = (1 << (map_bits + 1));
map->ops = ops;
@@ -845,15 +845,11 @@ int tracing_map_init(struct tracing_map *map)
static int cmp_entries_dup(const void *A, const void *B)
{
const struct tracing_map_sort_entry *a, *b;
- int ret = 0;
a = *(const struct tracing_map_sort_entry **)A;
b = *(const struct tracing_map_sort_entry **)B;
- if (memcmp(a->key, b->key, a->elt->map->key_size))
- ret = 1;
-
- return ret;
+ return memcmp(a->key, b->key, a->elt->map->key_size);
}
static int cmp_entries_sum(const void *A, const void *B)