summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig64
-rw-r--r--kernel/trace/blktrace.c50
-rw-r--r--kernel/trace/bpf_trace.c911
-rw-r--r--kernel/trace/bpf_trace.h2
-rw-r--r--kernel/trace/fgraph.c1222
-rw-r--r--kernel/trace/fprobe.c785
-rw-r--r--kernel/trace/ftrace.c1308
-rw-r--r--kernel/trace/ftrace_internal.h18
-rw-r--r--kernel/trace/pid_list.c15
-rw-r--r--kernel/trace/preemptirq_delay_test.c3
-rw-r--r--kernel/trace/rethook.c4
-rw-r--r--kernel/trace/ring_buffer.c1798
-rw-r--r--kernel/trace/ring_buffer_benchmark.c4
-rw-r--r--kernel/trace/rv/Kconfig34
-rw-r--r--kernel/trace/rv/Makefile10
-rw-r--r--kernel/trace/rv/monitors/sched/Kconfig11
-rw-r--r--kernel/trace/rv/monitors/sched/sched.c38
-rw-r--r--kernel/trace/rv/monitors/sched/sched.h3
-rw-r--r--kernel/trace/rv/monitors/sco/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/sco/sco.c88
-rw-r--r--kernel/trace/rv/monitors/sco/sco.h47
-rw-r--r--kernel/trace/rv/monitors/sco/sco_trace.h15
-rw-r--r--kernel/trace/rv/monitors/scpd/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.c96
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd.h49
-rw-r--r--kernel/trace/rv/monitors/scpd/scpd_trace.h15
-rw-r--r--kernel/trace/rv/monitors/sncid/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/sncid/sncid.c96
-rw-r--r--kernel/trace/rv/monitors/sncid/sncid.h49
-rw-r--r--kernel/trace/rv/monitors/sncid/sncid_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snep/Kconfig15
-rw-r--r--kernel/trace/rv/monitors/snep/snep.c96
-rw-r--r--kernel/trace/rv/monitors/snep/snep.h49
-rw-r--r--kernel/trace/rv/monitors/snep/snep_trace.h15
-rw-r--r--kernel/trace/rv/monitors/snroc/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.c85
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc.h47
-rw-r--r--kernel/trace/rv/monitors/snroc/snroc_trace.h15
-rw-r--r--kernel/trace/rv/monitors/tss/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/tss/tss.c91
-rw-r--r--kernel/trace/rv/monitors/tss/tss.h47
-rw-r--r--kernel/trace/rv/monitors/tss/tss_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wip/Kconfig14
-rw-r--r--kernel/trace/rv/monitors/wip/wip.c4
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h1
-rw-r--r--kernel/trace/rv/monitors/wip/wip_trace.h15
-rw-r--r--kernel/trace/rv/monitors/wwnr/Kconfig13
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.c4
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h1
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr_trace.h16
-rw-r--r--kernel/trace/rv/rv.c169
-rw-r--r--kernel/trace/rv/rv.h4
-rw-r--r--kernel/trace/rv/rv_reactors.c29
-rw-r--r--kernel/trace/rv/rv_trace.h136
-rw-r--r--kernel/trace/trace.c1807
-rw-r--r--kernel/trace/trace.h251
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_branch.c14
-rw-r--r--kernel/trace/trace_clock.c2
-rw-r--r--kernel/trace/trace_dynevent.c39
-rw-r--r--kernel/trace/trace_dynevent.h1
-rw-r--r--kernel/trace/trace_entries.h57
-rw-r--r--kernel/trace/trace_eprobe.c49
-rw-r--r--kernel/trace/trace_event_perf.c10
-rw-r--r--kernel/trace/trace_events.c834
-rw-r--r--kernel/trace/trace_events_filter.c35
-rw-r--r--kernel/trace/trace_events_hist.c375
-rw-r--r--kernel/trace/trace_events_inject.c2
-rw-r--r--kernel/trace/trace_events_synth.c58
-rw-r--r--kernel/trace/trace_events_trigger.c175
-rw-r--r--kernel/trace/trace_events_user.c92
-rw-r--r--kernel/trace/trace_fprobe.c468
-rw-r--r--kernel/trace/trace_functions.c114
-rw-r--r--kernel/trace/trace_functions_graph.c572
-rw-r--r--kernel/trace/trace_hwlat.c6
-rw-r--r--kernel/trace/trace_irqsoff.c90
-rw-r--r--kernel/trace/trace_kdb.c22
-rw-r--r--kernel/trace/trace_kprobe.c367
-rw-r--r--kernel/trace/trace_mmiotrace.c20
-rw-r--r--kernel/trace/trace_osnoise.c222
-rw-r--r--kernel/trace/trace_output.c206
-rw-r--r--kernel/trace/trace_output.h9
-rw-r--r--kernel/trace/trace_preemptirq.c37
-rw-r--r--kernel/trace/trace_probe.c141
-rw-r--r--kernel/trace/trace_probe.h8
-rw-r--r--kernel/trace/trace_probe_tmpl.h2
-rw-r--r--kernel/trace/trace_sched_switch.c4
-rw-r--r--kernel/trace/trace_sched_wakeup.c64
-rw-r--r--kernel/trace/trace_selftest.c278
-rw-r--r--kernel/trace/trace_stack.c32
-rw-r--r--kernel/trace/trace_stat.c26
-rw-r--r--kernel/trace/trace_syscalls.c40
-rw-r--r--kernel/trace/trace_uprobe.c230
-rw-r--r--kernel/trace/tracing_map.c12
94 files changed, 11068 insertions, 3408 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 47345bf1d4a9..a3f35c7d83b6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.rst
-config HAVE_FUNCTION_GRAPH_RETVAL
+config HAVE_FUNCTION_GRAPH_FREGS
bool
+config HAVE_FTRACE_GRAPH_FUNC
+ bool
+ help
+ True if ftrace_graph_func() is defined.
+
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -57,6 +62,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
This allows for use of ftrace_regs_get_argument() and
ftrace_regs_get_stack_pointer().
+config HAVE_FTRACE_REGS_HAVING_PT_REGS
+ bool
+ help
+ If this is set, ftrace_regs has pt_regs, thus it can convert to
+ pt_regs without allocating memory.
+
config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
bool
help
@@ -163,7 +174,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
config GENERIC_TRACER
bool
@@ -204,7 +215,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
@@ -232,7 +243,7 @@ config FUNCTION_GRAPH_TRACER
config FUNCTION_GRAPH_RETVAL
bool "Kernel Function Graph Return Value"
- depends on HAVE_FUNCTION_GRAPH_RETVAL
+ depends on HAVE_FUNCTION_GRAPH_FREGS
depends on FUNCTION_GRAPH_TRACER
default n
help
@@ -242,6 +253,27 @@ config FUNCTION_GRAPH_RETVAL
enable it via the trace option funcgraph-retval.
See Documentation/trace/ftrace.rst
+config FUNCTION_GRAPH_RETADDR
+ bool "Kernel Function Graph Return Address"
+ depends on FUNCTION_GRAPH_TRACER
+ default n
+ help
+ Support recording and printing the function return address when
+ using function graph tracer. It can be helpful to locate code line that
+ the function is called. This feature is off by default, and you can
+ enable it via the trace option funcgraph-retaddr.
+
+config FUNCTION_TRACE_ARGS
+ bool
+ depends on PROBE_EVENTS_BTF_ARGS
+ default y
+ help
+ If supported with function argument access API and BTF, then
+ the function tracer and function graph tracer will support printing
+ of function arguments. This feature is off by default, and can be
+ enabled via the trace option func-args (for the function tracer) and
+ funcgraph-args (for the function graph tracer)
+
config DYNAMIC_FTRACE
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
@@ -286,10 +318,9 @@ config DYNAMIC_FTRACE_WITH_ARGS
config FPROBE
bool "Kernel Function Probe (fprobe)"
- depends on FUNCTION_TRACER
- depends on DYNAMIC_FTRACE_WITH_REGS
- depends on HAVE_RETHOOK
- select RETHOOK
+ depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
+ depends on DYNAMIC_FTRACE_WITH_ARGS
+ select FUNCTION_GRAPH_TRACER
default n
help
This option enables kernel function probe (fprobe) based on ftrace.
@@ -974,6 +1005,19 @@ config FTRACE_RECORD_RECURSION_SIZE
This file can be reset, but the limit can not change in
size at runtime.
+config FTRACE_VALIDATE_RCU_IS_WATCHING
+ bool "Validate RCU is on during ftrace execution"
+ depends on FUNCTION_TRACER
+ depends on ARCH_WANTS_NO_INSTR
+ help
+ All callbacks that attach to the function tracing have some sort of
+ protection against recursion. This option is only to verify that
+ ftrace (and other users of ftrace_test_recursion_trylock()) are not
+ called outside of RCU, as if they are, it can cause a race. But it
+ also has a noticeable overhead when enabled.
+
+ If unsure, say N
+
config RING_BUFFER_RECORD_RECURSION
bool "Record functions that recurse in the ring buffer"
depends on FTRACE_RECORD_RECURSION
@@ -1123,7 +1167,7 @@ config PREEMPTIRQ_DELAY_TEST
config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
- depends on SYNTH_EVENTS
+ depends on SYNTH_EVENTS && m
help
This option creates a test module to check the base
functionality of in-kernel synthetic event definition and
@@ -1136,7 +1180,7 @@ config SYNTH_EVENT_GEN_TEST
config KPROBE_EVENT_GEN_TEST
tristate "Test module for in-kernel kprobe event generation"
- depends on KPROBE_EVENTS
+ depends on KPROBE_EVENTS && m
help
This option creates a test module to check the base
functionality of in-kernel kprobe event definition.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d5d94510afd3..3f6a7bdc6edf 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
- buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+ strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
/*
* some device names have larger paths - convert the slashes
@@ -618,8 +617,9 @@ err:
return ret;
}
-static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev, char __user *arg)
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -628,29 +628,18 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (ret)
return -EFAULT;
+ mutex_lock(&q->debugfs_mutex);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ mutex_unlock(&q->debugfs_mutex);
if (ret)
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
-
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
-{
- int ret;
-
- mutex_lock(&q->debugfs_mutex);
- ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->debugfs_mutex);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -674,12 +663,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.pid = cbuts.pid,
};
+ mutex_lock(&q->debugfs_mutex);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ mutex_unlock(&q->debugfs_mutex);
if (ret)
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
@@ -733,12 +724,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
int ret, start = 0;
char b[BDEVNAME_SIZE];
- mutex_lock(&q->debugfs_mutex);
-
switch (cmd) {
case BLKTRACESETUP:
snprintf(b, sizeof(b), "%pg", bdev);
- ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -750,17 +739,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
start = 1;
fallthrough;
case BLKTRACESTOP:
- ret = __blk_trace_startstop(q, start);
+ ret = blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = __blk_trace_remove(q);
+ ret = blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
-
- mutex_unlock(&q->debugfs_mutex);
return ret;
}
@@ -906,11 +893,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
-{
- blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
-}
-
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio)
{
@@ -1102,8 +1084,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
- WARN_ON(ret);
ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
@@ -1138,7 +1118,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
@@ -1475,7 +1454,6 @@ static const struct {
[__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
[__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
[__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
- [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
[__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
};
@@ -1909,6 +1887,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
rwbs[i++] = 'S';
if (opf & REQ_META)
rwbs[i++] = 'M';
+ if (opf & REQ_ATOMIC)
+ rwbs[i++] = 'U';
rwbs[i] = '\0';
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9dc605f08a23..132c8be6f635 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,7 +24,6 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
-#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -358,17 +357,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
-{
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
- current->comm, task_pid_nr(current));
-
- return &bpf_probe_write_user_proto;
-}
-
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
@@ -404,7 +392,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_SIZE,
};
-static void __set_printk_clr_event(void)
+static void __set_printk_clr_event(struct work_struct *work)
{
/*
* This program might be calling bpf_trace_printk,
@@ -417,10 +405,11 @@ static void __set_printk_clr_event(void)
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
+static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_printk_proto;
}
@@ -463,7 +452,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_vprintk_proto;
}
@@ -583,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
return value;
}
-static const struct bpf_func_proto bpf_perf_event_read_proto = {
+const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -618,9 +607,15 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void)
+{
+ return &bpf_perf_event_read_value_proto;
+}
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_sample_data *sd)
+ u64 flags, struct perf_raw_record *raw,
+ struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
@@ -645,6 +640,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
+ perf_sample_save_raw_data(sd, event, raw);
+
return perf_event_output(event, sd, regs);
}
@@ -688,9 +685,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- err = __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_trace_nest_level);
preempt_enable();
@@ -749,9 +745,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- ret = __bpf_perf_event_output(regs, map, flags, sd);
+ ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
preempt_enable();
@@ -798,34 +793,13 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = {
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
-BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
-{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct cgroup *cgrp;
-
- if (unlikely(idx >= array->map.max_entries))
- return -E2BIG;
-
- cgrp = READ_ONCE(array->ptrs[idx]);
- if (unlikely(!cgrp))
- return -EAGAIN;
-
- return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
- .func = bpf_current_task_under_cgroup,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_ANYTHING,
-};
-
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
enum pid_type type;
+ bool has_siginfo;
+ struct kernel_siginfo info;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -833,30 +807,49 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
static void do_bpf_send_signal(struct irq_work *entry)
{
struct send_signal_irq_work *work;
+ struct kernel_siginfo *siginfo;
work = container_of(entry, struct send_signal_irq_work, irq_work);
- group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
+ siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV;
+
+ group_send_sig_info(work->sig, siginfo, work->task, work->type);
put_task_struct(work->task);
}
-static int bpf_send_signal_common(u32 sig, enum pid_type type)
+static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value)
{
struct send_signal_irq_work *work = NULL;
+ struct kernel_siginfo info;
+ struct kernel_siginfo *siginfo;
+
+ if (!task) {
+ task = current;
+ siginfo = SEND_SIG_PRIV;
+ } else {
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ info.si_value.sival_ptr = (void *)(unsigned long)value;
+ siginfo = &info;
+ }
/* Similar to bpf_probe_write_user, task needs to be
* in a sound condition and kernel memory access be
* permitted in order to send signal to the current
* task.
*/
- if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
+ if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
/* Task should not be pid=1 to avoid kernel panic. */
- if (unlikely(is_global_init(current)))
+ if (unlikely(is_global_init(task)))
return -EPERM;
- if (irqs_disabled()) {
+ if (preempt_count() != 0 || irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
@@ -871,22 +864,25 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
- work->task = get_task_struct(current);
+ work->task = get_task_struct(task);
+ work->has_siginfo = siginfo == &info;
+ if (work->has_siginfo)
+ copy_siginfo(&work->info, &info);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
- return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
+ return group_send_sig_info(sig, siginfo, task, type);
}
BPF_CALL_1(bpf_send_signal, u32, sig)
{
- return bpf_send_signal_common(sig, PIDTYPE_TGID);
+ return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_proto = {
+const struct bpf_func_proto bpf_send_signal_proto = {
.func = bpf_send_signal,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -895,10 +891,10 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
{
- return bpf_send_signal_common(sig, PIDTYPE_PID);
+ return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+const struct bpf_func_proto bpf_send_signal_thread_proto = {
.func = bpf_send_signal_thread,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1048,21 +1044,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#ifdef CONFIG_X86_KERNEL_IBT
-static unsigned long get_entry_ip(unsigned long fentry_ip)
+static inline unsigned long get_entry_ip(unsigned long fentry_ip)
{
- u32 instr;
-
- /* Being extra safe in here in case entry ip is on the page-edge. */
- if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
- return fentry_ip;
- if (is_endbr(instr))
+#ifdef CONFIG_X86_KERNEL_IBT
+ if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE)))
fentry_ip -= ENDBR_INSN_SIZE;
+#endif
return fentry_ip;
}
-#else
-#define get_entry_ip(fentry_ip) fentry_ip
-#endif
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
@@ -1182,9 +1171,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
u32 entry_cnt = size / br_entry_size;
@@ -1197,10 +1183,9 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return -ENOENT;
return entry_cnt * br_entry_size;
-#endif
}
-static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
.func = bpf_get_branch_snapshot,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -1224,7 +1209,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_LONG,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u64),
};
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
@@ -1240,7 +1226,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_LONG,
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
+ .arg2_size = sizeof(u64),
};
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
@@ -1367,8 +1354,8 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
/**
* bpf_verify_pkcs7_signature - verify a PKCS#7 signature
- * @data_ptr: data to verify
- * @sig_ptr: signature of the data
+ * @data_p: data to verify
+ * @sig_p: signature of the data
* @trusted_keyring: keyring with keys trusted for signature verification
*
* Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
@@ -1376,10 +1363,12 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey)
*
* Return: 0 on success, a negative value on error.
*/
-__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
- struct bpf_dynptr_kern *sig_ptr,
+__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p,
+ struct bpf_dynptr *sig_p,
struct bpf_key *trusted_keyring)
{
+ struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p;
+ struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p;
const void *data, *sig;
u32 data_len, sig_len;
int ret;
@@ -1435,133 +1424,14 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
-/* filesystem kfuncs */
-__bpf_kfunc_start_defs();
-
-/**
- * bpf_get_file_xattr - get xattr of a file
- * @file: file to get xattr from
- * @name__str: name of the xattr
- * @value_ptr: output buffer of the xattr value
- *
- * Get xattr *name__str* of *file* and store the output in *value_ptr*.
- *
- * For security reasons, only *name__str* with prefix "user." is allowed.
- *
- * Return: 0 on success, a negative value on error.
- */
-__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
- struct bpf_dynptr_kern *value_ptr)
-{
- struct dentry *dentry;
- u32 value_len;
- void *value;
- int ret;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
- value_len = __bpf_dynptr_size(value_ptr);
- value = __bpf_dynptr_data_rw(value_ptr, value_len);
- if (!value)
- return -EINVAL;
-
- dentry = file_dentry(file);
- ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
- if (ret)
- return ret;
- return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_KFUNCS_END(fs_kfunc_set_ids)
-
-static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
-{
- if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
- return 0;
-
- /* Only allow to attach from LSM hooks, to avoid recursion */
- return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
-}
-
-static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
- .owner = THIS_MODULE,
- .set = &fs_kfunc_set_ids,
- .filter = bpf_get_file_xattr_filter,
-};
-
-static int __init bpf_fs_kfuncs_init(void)
-{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
-}
-
-late_initcall(bpf_fs_kfuncs_init);
-
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_map_lookup_percpu_elem:
- return &bpf_map_lookup_percpu_elem_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- case BPF_FUNC_ktime_get_boot_ns:
- return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
- case BPF_FUNC_get_current_task:
- return &bpf_get_current_task_proto;
- case BPF_FUNC_get_current_task_btf:
- return &bpf_get_current_task_btf_proto;
- case BPF_FUNC_task_pt_regs:
- return &bpf_task_pt_regs_proto;
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_perf_event_read:
- return &bpf_perf_event_read_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_probe_write_user:
- return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
- NULL : bpf_get_probe_write_proto();
- case BPF_FUNC_probe_read_user:
- return &bpf_probe_read_user_proto;
- case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_proto;
- case BPF_FUNC_probe_read_user_str:
- return &bpf_probe_read_user_str_proto;
- case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
@@ -1570,69 +1440,50 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_cgrp_storage_get:
- return &bpf_cgrp_storage_get_proto;
- case BPF_FUNC_cgrp_storage_delete:
- return &bpf_cgrp_storage_delete_proto;
-#endif
- case BPF_FUNC_send_signal:
- return &bpf_send_signal_proto;
- case BPF_FUNC_send_signal_thread:
- return &bpf_send_signal_thread_proto;
- case BPF_FUNC_perf_event_read_value:
- return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_get_ns_current_pid_tgid:
- return &bpf_get_ns_current_pid_tgid_proto;
- case BPF_FUNC_ringbuf_output:
- return &bpf_ringbuf_output_proto;
- case BPF_FUNC_ringbuf_reserve:
- return &bpf_ringbuf_reserve_proto;
- case BPF_FUNC_ringbuf_submit:
- return &bpf_ringbuf_submit_proto;
- case BPF_FUNC_ringbuf_discard:
- return &bpf_ringbuf_discard_proto;
- case BPF_FUNC_ringbuf_query:
- return &bpf_ringbuf_query_proto;
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- case BPF_FUNC_get_task_stack:
- return &bpf_get_task_stack_proto;
- case BPF_FUNC_copy_from_user:
- return &bpf_copy_from_user_proto;
- case BPF_FUNC_copy_from_user_task:
- return &bpf_copy_from_user_task_proto;
- case BPF_FUNC_snprintf_btf:
- return &bpf_snprintf_btf_proto;
- case BPF_FUNC_per_cpu_ptr:
- return &bpf_per_cpu_ptr_proto;
- case BPF_FUNC_this_cpu_ptr:
- return &bpf_this_cpu_ptr_proto;
- case BPF_FUNC_task_storage_get:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_get_recur_proto;
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_delete_recur_proto;
- return &bpf_task_storage_delete_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_snprintf:
- return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
- case BPF_FUNC_get_branch_snapshot:
- return &bpf_get_branch_snapshot_proto;
- case BPF_FUNC_find_vma:
- return &bpf_find_vma_proto;
- case BPF_FUNC_trace_vprintk:
- return bpf_get_trace_vprintk_proto();
default:
- return bpf_base_func_proto(func_id, prog);
+ break;
+ }
+
+ func_proto = bpf_base_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : &bpf_probe_write_user_proto;
+ default:
+ return NULL;
}
}
+static bool is_kprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_kprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_uprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
+static inline bool is_uprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
+}
+
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1642,21 +1493,21 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto;
+ return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_func_ip_proto_kprobe_multi;
- if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ if (is_uprobe_multi(prog))
return &bpf_get_func_ip_proto_uprobe_multi;
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_attach_cookie_proto_kmulti;
- if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
+ if (is_uprobe_multi(prog))
return &bpf_get_attach_cookie_proto_umulti;
return &bpf_get_attach_cookie_proto_trace;
default:
@@ -1902,7 +1753,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void)
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
- if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ if (nest_level > ARRAY_SIZE(tp_regs->regs)) {
this_cpu_dec(bpf_raw_tp_nest_level);
return ERR_PTR(-EBUSY);
}
@@ -2008,6 +1859,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_raw_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_raw_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_tracing;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -2070,6 +1923,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
case BPF_FUNC_get_attach_cookie:
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_attach_cookie_proto_tracing;
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
@@ -2278,6 +2134,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
+ struct bpf_prog *prog = NULL;
int ret;
mutex_lock(&bpf_event_mutex);
@@ -2286,9 +2143,10 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
+ if (!old_array)
+ goto put;
+
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
- if (ret == -ENOENT)
- goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
@@ -2296,11 +2154,23 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_free_sleepable(old_array);
}
- bpf_prog_put(event->prog);
+put:
+ prog = event->prog;
event->prog = NULL;
unlock:
mutex_unlock(&bpf_event_mutex);
+
+ if (prog) {
+ /*
+ * It could be that the bpf_prog is not sleepable (and will be freed
+ * via normal RCU), but is called from a point that supports sleepable
+ * programs and uses tasks-trace-RCU.
+ */
+ synchronize_rcu_tasks_trace();
+
+ bpf_prog_put(prog);
+ }
}
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
@@ -2363,23 +2233,32 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address((unsigned long)btp);
module_put(mod);
- preempt_enable();
}
static __always_inline
-void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct bpf_prog *prog = link->link.prog;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_trace_run_ctx run_ctx;
+
cant_sleep();
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
+
+ run_ctx.bpf_cookie = link->cookie;
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
out:
this_cpu_dec(*(prog->active));
}
@@ -2408,12 +2287,12 @@ out:
#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
#define BPF_TRACE_DEFN_x(x) \
- void bpf_trace_run##x(struct bpf_prog *prog, \
+ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \
REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
{ \
u64 args[x]; \
REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
- __bpf_trace_run(prog, args); \
+ __bpf_trace_run(link, args); \
} \
EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
@@ -2429,9 +2308,10 @@ BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
-static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
struct tracepoint *tp = btp->tp;
+ struct bpf_prog *prog = link->link.prog;
/*
* check that program doesn't access arguments beyond what's
@@ -2443,18 +2323,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
- prog);
-}
-
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
-{
- return __bpf_probe_register(btp, prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
}
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
- return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -2577,6 +2451,12 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+struct bpf_session_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ bool is_return;
+ void *data;
+};
+
#ifdef CONFIG_FPROBE
struct bpf_kprobe_multi_link {
struct bpf_link link;
@@ -2590,7 +2470,7 @@ struct bpf_kprobe_multi_link {
};
struct bpf_kprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
struct bpf_kprobe_multi_link *link;
unsigned long entry_ip;
};
@@ -2600,6 +2480,20 @@ struct user_syms {
char *buf;
};
+#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS
+static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);
+#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs)
+#else
+#define bpf_kprobe_multi_pt_regs_ptr() (NULL)
+#endif
+
+static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip)
+{
+ unsigned long ip = ftrace_get_symaddr(fentry_ip);
+
+ return ip ? : fentry_ip;
+}
+
static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
{
unsigned long __user usymbol;
@@ -2769,7 +2663,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
if (WARN_ON_ONCE(!ctx))
return 0;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
link = run_ctx->link;
if (!link->cookies)
return 0;
@@ -2786,30 +2681,38 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_kprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
static int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
- unsigned long entry_ip, struct pt_regs *regs)
+ unsigned long entry_ip, struct ftrace_regs *fregs,
+ bool is_return, void *data)
{
struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.link = link,
.entry_ip = entry_ip,
};
struct bpf_run_ctx *old_run_ctx;
+ struct pt_regs *regs;
int err;
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
bpf_prog_inc_misses_counter(link->link.prog);
- err = 0;
+ err = 1;
goto out;
}
migrate_disable();
rcu_read_lock();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
@@ -2822,25 +2725,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
+ int err;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
- return 0;
+ err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, false, data);
+ return is_kprobe_session(link->link.prog) ? err : 0;
}
static void
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, true, data);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
@@ -2913,18 +2819,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
u32 i, err = 0;
for (i = 0; i < addrs_cnt; i++) {
+ bool skip_add = false;
struct module *mod;
- preempt_disable();
- mod = __module_address(addrs[i]);
- /* Either no module or we it's already stored */
- if (!mod || has_module(&arr, mod)) {
- preempt_enable();
- continue;
+ scoped_guard(rcu) {
+ mod = __module_address(addrs[i]);
+ /* Either no module or it's already stored */
+ if (!mod || has_module(&arr, mod)) {
+ skip_add = true;
+ break; /* scoped_guard */
+ }
+ if (!try_module_get(mod))
+ err = -EINVAL;
}
- if (!try_module_get(mod))
- err = -EINVAL;
- preempt_enable();
+ if (skip_add)
+ continue;
if (err)
break;
err = add_module(&arr, mod);
@@ -2973,7 +2882,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ if (!is_kprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.kprobe_multi.flags;
@@ -3054,10 +2966,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error;
- if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_exit_handler;
- else
+ if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
link->fp.entry_handler = kprobe_multi_link_handler;
+ if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
+ if (is_kprobe_session(prog))
+ link->fp.entry_data_size = sizeof(u64);
link->addrs = addrs;
link->cookies = cookies;
@@ -3122,7 +3036,9 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
+ struct uprobe *uprobe;
struct uprobe_consumer consumer;
+ bool session;
};
struct bpf_uprobe_multi_link {
@@ -3135,20 +3051,20 @@ struct bpf_uprobe_multi_link {
};
struct bpf_uprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
unsigned long entry_ip;
struct bpf_uprobe *uprobe;
};
-static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
- u32 cnt)
+static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
- for (i = 0; i < cnt; i++) {
- uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
- &uprobes[i].consumer);
- }
+ for (i = 0; i < cnt; i++)
+ uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer);
+
+ if (cnt)
+ uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
@@ -3156,7 +3072,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
if (umulti_link->task)
put_task_struct(umulti_link->task);
path_put(&umulti_link->path);
@@ -3182,7 +3098,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
struct bpf_uprobe_multi_link *umulti_link;
u32 ucount = info->uprobe_multi.count;
int err = 0, i;
- long left;
+ char *p, *buf;
+ long left = 0;
if (!upath ^ !upath_size)
return -EINVAL;
@@ -3196,26 +3113,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
info->uprobe_multi.pid = umulti_link->task ?
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
- if (upath) {
- char *p, *buf;
-
- upath_size = min_t(u32, upath_size, PATH_MAX);
-
- buf = kmalloc(upath_size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- p = d_path(&umulti_link->path, buf, upath_size);
- if (IS_ERR(p)) {
- kfree(buf);
- return PTR_ERR(p);
- }
- upath_size = buf + upath_size - p;
- left = copy_to_user(upath, p, upath_size);
+ upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX;
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
kfree(buf);
- if (left)
- return -EFAULT;
- info->uprobe_multi.path_size = upath_size;
+ return PTR_ERR(p);
}
+ upath_size = buf + upath_size - p;
+
+ if (upath)
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
if (!uoffsets && !ucookies && !uref_ctr_offsets)
return 0;
@@ -3248,19 +3162,24 @@ static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
unsigned long entry_ip,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ bool is_return, void *data)
{
struct bpf_uprobe_multi_link *link = uprobe->link;
struct bpf_uprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.entry_ip = entry_ip,
.uprobe = uprobe,
};
struct bpf_prog *prog = link->link.prog;
bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
- int err = 0;
+ int err;
- if (link->task && current != link->task)
+ if (link->task && !same_thread_group(current, link->task))
return 0;
if (sleepable)
@@ -3270,7 +3189,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
migrate_disable();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
@@ -3284,8 +3203,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
}
static bool
-uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
- struct mm_struct *mm)
+uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
{
struct bpf_uprobe *uprobe;
@@ -3294,28 +3212,36 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx
}
static int
-uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs)
+uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data)
{
struct bpf_uprobe *uprobe;
+ int ret;
uprobe = container_of(con, struct bpf_uprobe, consumer);
- return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
+ ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data);
+ if (uprobe->session)
+ return ret ? UPROBE_HANDLER_IGNORE : 0;
+ return 0;
}
static int
-uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs)
+uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs,
+ __u64 *data)
{
struct bpf_uprobe *uprobe;
uprobe = container_of(con, struct bpf_uprobe, consumer);
- return uprobe_prog_run(uprobe, func, regs);
+ uprobe_prog_run(uprobe, func, regs, true, data);
+ return 0;
}
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
@@ -3323,7 +3249,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->uprobe->cookie;
}
@@ -3347,7 +3274,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ if (!is_uprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
@@ -3361,8 +3291,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path);
uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets);
cnt = attr->link_create.uprobe_multi.cnt;
+ pid = attr->link_create.uprobe_multi.pid;
- if (!upath || !uoffsets || !cnt)
+ if (!upath || !uoffsets || !cnt || pid < 0)
return -EINVAL;
if (cnt > MAX_UPROBE_MULTI_CNT)
return -E2BIG;
@@ -3386,10 +3317,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error_path_put;
}
- pid = attr->link_create.uprobe_multi.pid;
if (pid) {
rcu_read_lock();
- task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
rcu_read_unlock();
if (!task) {
err = -ESRCH;
@@ -3425,11 +3355,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uprobes[i].link = link;
- if (flags & BPF_F_UPROBE_MULTI_RETURN)
- uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
- else
+ if (!(flags & BPF_F_UPROBE_MULTI_RETURN))
uprobes[i].consumer.handler = uprobe_multi_link_handler;
-
+ if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog))
+ uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
+ if (is_uprobe_session(prog))
+ uprobes[i].session = true;
if (pid)
uprobes[i].consumer.filter = uprobe_multi_link_filter;
}
@@ -3444,22 +3375,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
- err = uprobe_register_refctr(d_real_inode(link->path.dentry),
- uprobes[i].offset,
- uprobes[i].ref_ctr_offset,
- &uprobes[i].consumer);
- if (err) {
- bpf_uprobe_unregister(&path, uprobes, i);
- goto error_free;
+ uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
+ uprobes[i].offset,
+ uprobes[i].ref_ctr_offset,
+ &uprobes[i].consumer);
+ if (IS_ERR(uprobes[i].uprobe)) {
+ err = PTR_ERR(uprobes[i].uprobe);
+ link->cnt = i;
+ goto error_unregister;
}
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
- goto error_free;
+ goto error_unregister;
return bpf_link_settle(&link_primer);
+error_unregister:
+ bpf_uprobe_unregister(uprobes, link->cnt);
+
error_free:
kvfree(uprobes);
kfree(link);
@@ -3483,3 +3418,263 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return 0;
}
#endif /* CONFIG_UPROBES */
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc bool bpf_session_is_return(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->is_return;
+}
+
+__bpf_kfunc __u64 *bpf_session_cookie(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->data;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_session_is_return)
+BTF_ID_FLAGS(func, bpf_session_cookie)
+BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+
+static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
+ return -EACCES;
+
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &kprobe_multi_kfunc_set_ids,
+ .filter = bpf_kprobe_multi_filter,
+};
+
+static int __init bpf_kprobe_multi_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+}
+
+late_initcall(bpf_kprobe_multi_kfuncs_init);
+
+typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
+
+/*
+ * The __always_inline is to make sure the compiler doesn't
+ * generate indirect calls into callbacks, which is expensive,
+ * on some kernel configurations. This allows compiler to put
+ * direct calls into all the specific callback implementations
+ * (copy_user_data_sleepable, copy_user_data_nofault, and so on)
+ */
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
+ const void *unsafe_src,
+ copy_fn_t str_copy_fn,
+ struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ u32 chunk_sz, off;
+ void *dst_slice;
+ int cnt, err;
+ char buf[256];
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return str_copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz - 1) {
+ chunk_sz = min_t(u32, sizeof(buf), size - off);
+ /* Expect str_copy_fn to return count of copied bytes, including
+ * zero terminator. Next iteration increment off by chunk_sz - 1 to
+ * overwrite NUL.
+ */
+ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (cnt < 0)
+ return cnt;
+ err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0);
+ if (err)
+ return err;
+ if (cnt < chunk_sz || chunk_sz == 1) /* we are done */
+ return off + cnt;
+ }
+ return off;
+}
+
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
+ u32 size, const void *unsafe_src,
+ copy_fn_t copy_fn, struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ void *dst_slice;
+ char buf[256];
+ u32 off, chunk_sz;
+ int err;
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz) {
+ chunk_sz = min_t(u32, sizeof(buf), size - off);
+ err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (!tsk) { /* Read from the current task */
+ ret = copy_from_user(dst, (const void __user *)unsafe_src, size);
+ if (ret)
+ return -EFAULT;
+ return 0;
+ }
+
+ ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ if (ret != size)
+ return -EFAULT;
+ return 0;
+}
+
+static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (unlikely(size == 0))
+ return 0;
+
+ if (tsk) {
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ } else {
+ ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1);
+ /* strncpy_from_user does not guarantee NUL termination */
+ if (ret >= 0)
+ ((char *)dst)[ret] = '\0';
+ }
+
+ if (ret < 0)
+ return ret;
+ return ret + 1;
+}
+
+static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
+ u64 value)
+{
+ if (type != PIDTYPE_PID && type != PIDTYPE_TGID)
+ return -EINVAL;
+
+ return bpf_send_signal_common(sig, type, task, value);
+}
+
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_sleepable, tsk);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_sleepable, tsk);
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
index 9acbc11ac7bb..c4075b56becc 100644
--- a/kernel/trace/bpf_trace.h
+++ b/kernel/trace/bpf_trace.h
@@ -19,7 +19,7 @@ TRACE_EVENT(bpf_trace_printk,
),
TP_fast_assign(
- __assign_str(bpf_string, bpf_string);
+ __assign_str(bpf_string);
),
TP_printk("%s", __get_str(bpf_string))
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c83c005e654e..c5b207992fb4 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,9 +7,11 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/bits.h>
#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
+#include <linux/static_call.h>
#include <linux/slab.h>
#include <trace/events/sched.h>
@@ -17,19 +19,487 @@
#include "ftrace_internal.h"
#include "trace.h"
-#ifdef CONFIG_DYNAMIC_FTRACE
-#define ASSIGN_OPS_HASH(opsname, val) \
- .func_hash = val, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
-#else
-#define ASSIGN_OPS_HASH(opsname, val)
-#endif
+/*
+ * FGRAPH_FRAME_SIZE: Size in bytes of the meta data on the shadow stack
+ * FGRAPH_FRAME_OFFSET: Size in long words of the meta data frame
+ */
+#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack)
+#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
+
+/*
+ * On entry to a function (via function_graph_enter()), a new fgraph frame
+ * (ftrace_ret_stack) is pushed onto the stack as well as a word that
+ * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
+ *
+ * bits: 0 - 9 offset in words from the previous ftrace_ret_stack
+ *
+ * bits: 10 - 11 Type of storage
+ * 0 - reserved
+ * 1 - bitmap of fgraph_array index
+ * 2 - reserved data
+ *
+ * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
+ * bits: 12 - 27 The bitmap of fgraph_ops fgraph_array index
+ * That is, it's a bitmask of 0-15 (16 bits)
+ * where if a corresponding ops in the fgraph_array[]
+ * expects a callback from the return of the function
+ * it's corresponding bit will be set.
+ *
+ *
+ * The top of the ret_stack (when not empty) will always have a reference
+ * word that points to the last fgraph frame that was saved.
+ *
+ * For reserved data:
+ * bits: 12 - 17 The size in words that is stored
+ * bits: 18 - 23 The index of fgraph_array, which shows who is stored
+ *
+ * That is, at the end of function_graph_enter, if the first and forth
+ * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
+ * on the return of the function being traced, and the forth fgraph_ops
+ * stored two words of data, this is what will be on the task's shadow
+ * ret_stack: (the stack grows upward)
+ *
+ * ret_stack[SHADOW_STACK_OFFSET]
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[15] |
+ * ...
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[0] |
+ * ret_stack[SHADOW_STACK_MAX_OFFSET]
+ * ...
+ * | | <- task->curr_ret_stack
+ * +--------------------------------------------+
+ * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \ | This is for fgraph_ops[3].
+ * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \ | The data size is 2 words.
+ * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ |
+ * | (offset2:FGRAPH_FRAME_OFFSET+3) | <- the offset2 is from here
+ * +--------------------------------------------+ ( It is 4 words from the ret_stack)
+ * | STORED DATA WORD 2 |
+ * | STORED DATA WORD 1 |
+ * +--------------------------------------------+
+ * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way* |
+ * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \ |
+ * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ |
+ * | (offset1:FGRAPH_FRAME_OFFSET) | <- the offset1 is from here
+ * +--------------------------------------------+
+ * | struct ftrace_ret_stack |
+ * | (stores the saved ret pointer) | <- the offset points here
+ * +--------------------------------------------+
+ * | (X) | (N) | ( N words away from
+ * | | previous ret_stack)
+ * ...
+ * ret_stack[0]
+ *
+ * If a backtrace is required, and the real return pointer needs to be
+ * fetched, then it looks at the task's curr_ret_stack offset, if it
+ * is greater than zero (reserved, or right before popped), it would mask
+ * the value by FGRAPH_FRAME_OFFSET_MASK to get the offset of the
+ * ftrace_ret_stack structure stored on the shadow stack.
+ */
+
+/*
+ * The following is for the top word on the stack:
+ *
+ * FGRAPH_FRAME_OFFSET (0-9) holds the offset delta to the fgraph frame
+ * FGRAPH_TYPE (10-11) holds the type of word this is.
+ * (RESERVED or BITMAP)
+ */
+#define FGRAPH_FRAME_OFFSET_BITS 10
+#define FGRAPH_FRAME_OFFSET_MASK GENMASK(FGRAPH_FRAME_OFFSET_BITS - 1, 0)
+
+#define FGRAPH_TYPE_BITS 2
+#define FGRAPH_TYPE_MASK GENMASK(FGRAPH_TYPE_BITS - 1, 0)
+#define FGRAPH_TYPE_SHIFT FGRAPH_FRAME_OFFSET_BITS
+
+enum {
+ FGRAPH_TYPE_RESERVED = 0,
+ FGRAPH_TYPE_BITMAP = 1,
+ FGRAPH_TYPE_DATA = 2,
+};
+
+/*
+ * For BITMAP type:
+ * FGRAPH_INDEX (12-27) bits holding the gops index wanting return callback called
+ */
+#define FGRAPH_INDEX_BITS 16
+#define FGRAPH_INDEX_MASK GENMASK(FGRAPH_INDEX_BITS - 1, 0)
+#define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+
+/*
+ * For DATA type:
+ * FGRAPH_DATA (12-17) bits hold the size of data (in words)
+ * FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is for
+ *
+ * Note:
+ * data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words.
+ */
+#define FGRAPH_DATA_BITS 5
+#define FGRAPH_DATA_MASK GENMASK(FGRAPH_DATA_BITS - 1, 0)
+#define FGRAPH_DATA_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
+#define FGRAPH_MAX_DATA_SIZE (sizeof(long) * (1 << FGRAPH_DATA_BITS))
+
+#define FGRAPH_DATA_INDEX_BITS 4
+#define FGRAPH_DATA_INDEX_MASK GENMASK(FGRAPH_DATA_INDEX_BITS - 1, 0)
+#define FGRAPH_DATA_INDEX_SHIFT (FGRAPH_DATA_SHIFT + FGRAPH_DATA_BITS)
+
+#define FGRAPH_MAX_INDEX \
+ ((FGRAPH_INDEX_SIZE << FGRAPH_DATA_BITS) + FGRAPH_RET_INDEX)
+
+#define FGRAPH_ARRAY_SIZE FGRAPH_INDEX_BITS
+
+/*
+ * SHADOW_STACK_SIZE: The size in bytes of the entire shadow stack
+ * SHADOW_STACK_OFFSET: The size in long words of the shadow stack
+ * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added
+ */
+#define SHADOW_STACK_SIZE (4096)
+#define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long))
+/* Leave on a buffer at the end */
+#define SHADOW_STACK_MAX_OFFSET \
+ (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE))
+
+/* RET_STACK(): Return the frame from a given @offset from task @t */
+#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
+
+/*
+ * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * ret_stack to store task specific state.
+ */
+#define SHADOW_STACK_TASK_VARS(ret_stack) \
+ ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - FGRAPH_ARRAY_SIZE]))
DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
+static struct kmem_cache *fgraph_stack_cachep;
+
+static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+static unsigned long fgraph_array_bitmask;
+
+/* LRU index table for fgraph_array */
+static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
+static int fgraph_lru_next;
+static int fgraph_lru_last;
+
+/* Initialize fgraph_lru_table with unused index */
+static void fgraph_lru_init(void)
+{
+ int i;
+
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_lru_table[i] = i;
+}
+
+/* Release the used index to the LRU table */
+static int fgraph_lru_release_index(int idx)
+{
+ if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE ||
+ WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1))
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_last] = idx;
+ fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+
+ clear_bit(idx, &fgraph_array_bitmask);
+ return 0;
+}
+
+/* Allocate a new index from LRU table */
+static int fgraph_lru_alloc_index(void)
+{
+ int idx = fgraph_lru_table[fgraph_lru_next];
+
+ /* No id is available */
+ if (idx == -1)
+ return -1;
+
+ fgraph_lru_table[fgraph_lru_next] = -1;
+ fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+
+ set_bit(idx, &fgraph_array_bitmask);
+ return idx;
+}
+
+/* Get the offset to the fgraph frame from a ret_stack value */
+static inline int __get_offset(unsigned long val)
+{
+ return val & FGRAPH_FRAME_OFFSET_MASK;
+}
+
+/* Get the type of word from a ret_stack value */
+static inline int __get_type(unsigned long val)
+{
+ return (val >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK;
+}
+
+/* Get the data_index for a DATA type ret_stack word */
+static inline int __get_data_index(unsigned long val)
+{
+ return (val >> FGRAPH_DATA_INDEX_SHIFT) & FGRAPH_DATA_INDEX_MASK;
+}
+
+/* Get the data_size for a DATA type ret_stack word */
+static inline int __get_data_size(unsigned long val)
+{
+ return ((val >> FGRAPH_DATA_SHIFT) & FGRAPH_DATA_MASK) + 1;
+}
+
+/* Get the word from the ret_stack at @offset */
+static inline unsigned long get_fgraph_entry(struct task_struct *t, int offset)
+{
+ return t->ret_stack[offset];
+}
+
+/* Get the FRAME_OFFSET from the word from the @offset on ret_stack */
+static inline int get_frame_offset(struct task_struct *t, int offset)
+{
+ return __get_offset(t->ret_stack[offset]);
+}
+
+/* For BITMAP type: get the bitmask from the @offset at ret_stack */
+static inline unsigned long
+get_bitmap_bits(struct task_struct *t, int offset)
+{
+ return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK;
+}
+
+/* Write the bitmap to the ret_stack at @offset (does index, offset and bitmask) */
+static inline void
+set_bitmap(struct task_struct *t, int offset, unsigned long bitmap)
+{
+ t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) |
+ (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+}
+
+/* For DATA type: get the data saved under the ret_stack word at @offset */
+static inline void *get_data_type_data(struct task_struct *t, int offset)
+{
+ unsigned long val = t->ret_stack[offset];
+
+ if (__get_type(val) != FGRAPH_TYPE_DATA)
+ return NULL;
+ offset -= __get_data_size(val);
+ return (void *)&t->ret_stack[offset];
+}
+
+/* Create the ret_stack word for a DATA type */
+static inline unsigned long make_data_type_val(int idx, int size, int offset)
+{
+ return (idx << FGRAPH_DATA_INDEX_SHIFT) |
+ ((size - 1) << FGRAPH_DATA_SHIFT) |
+ (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT) | offset;
+}
+
+/* ftrace_graph_entry set to this to tell some archs to run function graph */
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
+{
+ return 0;
+}
+
+/* ftrace_graph_return set to this to tell some archs to run function graph */
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
+{
+}
+
+static void ret_stack_set_task_var(struct task_struct *t, int idx, long val)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ gvals[idx] = val;
+}
+
+static unsigned long *
+ret_stack_get_task_var(struct task_struct *t, int idx)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+ return &gvals[idx];
+}
+
+static void ret_stack_init_task_vars(unsigned long *ret_stack)
+{
+ unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack);
+
+ memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE);
+}
+
+/**
+ * fgraph_reserve_data - Reserve storage on the task's ret_stack
+ * @idx: The index of fgraph_array
+ * @size_bytes: The size in bytes to reserve
+ *
+ * Reserves space of up to FGRAPH_MAX_DATA_SIZE bytes on the
+ * task's ret_stack shadow stack, for a given fgraph_ops during
+ * the entryfunc() call. If entryfunc() returns zero, the storage
+ * is discarded. An entryfunc() can only call this once per iteration.
+ * The fgraph_ops retfunc() can retrieve this stored data with
+ * fgraph_retrieve_data().
+ *
+ * Returns: On success, a pointer to the data on the stack.
+ * Otherwise, NULL if there's not enough space left on the
+ * ret_stack for the data, or if fgraph_reserve_data() was called
+ * more than once for a single entryfunc() call.
+ */
+void *fgraph_reserve_data(int idx, int size_bytes)
+{
+ unsigned long val;
+ void *data;
+ int curr_ret_stack = current->curr_ret_stack;
+ int data_size;
+
+ if (size_bytes > FGRAPH_MAX_DATA_SIZE)
+ return NULL;
+
+ /* Convert the data size to number of longs. */
+ data_size = (size_bytes + sizeof(long) - 1) >> (sizeof(long) == 4 ? 2 : 3);
+
+ val = get_fgraph_entry(current, curr_ret_stack - 1);
+ data = &current->ret_stack[curr_ret_stack];
+
+ curr_ret_stack += data_size + 1;
+ if (unlikely(curr_ret_stack >= SHADOW_STACK_MAX_OFFSET))
+ return NULL;
+
+ val = make_data_type_val(idx, data_size, __get_offset(val) + data_size + 1);
+
+ /* Set the last word to be reserved */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ /* Make sure interrupts see this */
+ barrier();
+ current->curr_ret_stack = curr_ret_stack;
+ /* Again sync with interrupts, and reset reserve */
+ current->ret_stack[curr_ret_stack - 1] = val;
+
+ return data;
+}
+
+/**
+ * fgraph_retrieve_data - Retrieve stored data from fgraph_reserve_data()
+ * @idx: the index of fgraph_array (fgraph_ops::idx)
+ * @size_bytes: pointer to retrieved data size.
+ *
+ * This is to be called by a fgraph_ops retfunc(), to retrieve data that
+ * was stored by the fgraph_ops entryfunc() on the function entry.
+ * That is, this will retrieve the data that was reserved on the
+ * entry of the function that corresponds to the exit of the function
+ * that the fgraph_ops retfunc() is called on.
+ *
+ * Returns: The stored data from fgraph_reserve_data() called by the
+ * matching entryfunc() for the retfunc() this is called from.
+ * Or NULL if there was nothing stored.
+ */
+void *fgraph_retrieve_data(int idx, int *size_bytes)
+{
+ return fgraph_retrieve_parent_data(idx, size_bytes, 0);
+}
+
+/**
+ * fgraph_get_task_var - retrieve a task specific state variable
+ * @gops: The ftrace_ops that owns the task specific variable
+ *
+ * Every registered fgraph_ops has a task state variable
+ * reserved on the task's ret_stack. This function returns the
+ * address to that variable.
+ *
+ * Returns the address to the fgraph_ops @gops tasks specific
+ * unsigned long variable.
+ */
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops)
+{
+ return ret_stack_get_task_var(current, gops->idx);
+}
+
+/*
+ * @offset: The offset into @t->ret_stack to find the ret_stack entry
+ * @frame_offset: Where to place the offset into @t->ret_stack of that entry
+ *
+ * Returns a pointer to the previous ret_stack below @offset or NULL
+ * when it reaches the bottom of the stack.
+ *
+ * Calling this with:
+ *
+ * offset = task->curr_ret_stack;
+ * do {
+ * ret_stack = get_ret_stack(task, offset, &offset);
+ * } while (ret_stack);
+ *
+ * Will iterate through all the ret_stack entries from curr_ret_stack
+ * down to the first one.
+ */
+static inline struct ftrace_ret_stack *
+get_ret_stack(struct task_struct *t, int offset, int *frame_offset)
+{
+ int offs;
+
+ BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long));
+
+ if (unlikely(offset <= 0))
+ return NULL;
+
+ offs = get_frame_offset(t, --offset);
+ if (WARN_ON_ONCE(offs <= 0 || offs > offset))
+ return NULL;
+
+ offset -= offs;
+
+ *frame_offset = offset;
+ return RET_STACK(t, offset);
+}
+
+/**
+ * fgraph_retrieve_parent_data - get data from a parent function
+ * @idx: The index into the fgraph_array (fgraph_ops::idx)
+ * @size_bytes: A pointer to retrieved data size
+ * @depth: The depth to find the parent (0 is the current function)
+ *
+ * This is similar to fgraph_retrieve_data() but can be used to retrieve
+ * data from a parent caller function.
+ *
+ * Return: a pointer to the specified parent data or NULL if not found
+ */
+void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth)
+{
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = current->curr_ret_stack;
+ unsigned long val;
+
+ if (offset <= 0)
+ return NULL;
+
+ for (;;) {
+ int next_offset;
+
+ ret_stack = get_ret_stack(current, offset, &next_offset);
+ if (!ret_stack || --depth < 0)
+ break;
+ offset = next_offset;
+ }
+
+ if (!ret_stack)
+ return NULL;
+
+ offset--;
+
+ val = get_fgraph_entry(current, offset);
+ while (__get_type(val) == FGRAPH_TYPE_DATA) {
+ if (__get_data_index(val) == idx)
+ goto found;
+ offset -= __get_data_size(val) + 1;
+ val = get_fgraph_entry(current, offset);
+ }
+ return NULL;
+found:
+ if (size_bytes)
+ *size_bytes = __get_data_size(val) * sizeof(long);
+ return get_data_type_data(current, offset);
+}
+
/* Both enabled by default (can be cleared by function_graph tracer flags */
-static bool fgraph_sleep_time = true;
+bool fgraph_sleep_time = true;
#ifdef CONFIG_DYNAMIC_FTRACE
/*
@@ -51,6 +521,29 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
}
#endif
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return 0;
+}
+
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+}
+
+static struct fgraph_ops fgraph_stub = {
+ .entryfunc = ftrace_graph_entry_stub,
+ .retfunc = ftrace_graph_ret_stub,
+};
+
+static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub;
+DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub);
+DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub);
+static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct);
+
/**
* ftrace_graph_stop - set to permanently disable function graph tracing
*
@@ -67,10 +560,12 @@ void ftrace_graph_stop(void)
/* Add a function return address to the trace stack on thread info.*/
static int
ftrace_push_return_trace(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+ unsigned long frame_pointer, unsigned long *retp,
+ int fgraph_idx)
{
- unsigned long long calltime;
- int index;
+ struct ftrace_ret_stack *ret_stack;
+ unsigned long val;
+ int offset;
if (unlikely(ftrace_graph_is_dead()))
return -EBUSY;
@@ -78,32 +573,64 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
if (!current->ret_stack)
return -EBUSY;
+ BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
+
+ /* Set val to "reserved" with the delta to the new fgraph frame */
+ val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
+
/*
* We must make sure the ret_stack is tested before we read
* anything else.
*/
smp_rmb();
- /* The return trace stack is full */
- if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+ /*
+ * Check if there's room on the shadow stack to fit a fraph frame
+ * and a bitmap word.
+ */
+ if (current->curr_ret_stack + FGRAPH_FRAME_OFFSET + 1 >= SHADOW_STACK_MAX_OFFSET) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
- calltime = trace_clock_local();
+ offset = READ_ONCE(current->curr_ret_stack);
+ ret_stack = RET_STACK(current, offset);
+ offset += FGRAPH_FRAME_OFFSET;
- index = ++current->curr_ret_stack;
+ /* ret offset = FGRAPH_FRAME_OFFSET ; type = reserved */
+ current->ret_stack[offset] = val;
+ ret_stack->ret = ret;
+ /*
+ * The unwinders expect curr_ret_stack to point to either zero
+ * or an offset where to find the next ret_stack. Even though the
+ * ret stack might be bogus, we want to write the ret and the
+ * offset to find the ret_stack before we increment the stack point.
+ * If an interrupt comes in now before we increment the curr_ret_stack
+ * it may blow away what we wrote. But that's fine, because the
+ * offset will still be correct (even though the 'ret' won't be).
+ * What we worry about is the offset being correct after we increment
+ * the curr_ret_stack and before we update that offset, as if an
+ * interrupt comes in and does an unwind stack dump, it will need
+ * at least a correct offset!
+ */
+ barrier();
+ WRITE_ONCE(current->curr_ret_stack, offset + 1);
+ /*
+ * This next barrier is to ensure that an interrupt coming in
+ * will not corrupt what we are about to write.
+ */
barrier();
- current->ret_stack[index].ret = ret;
- current->ret_stack[index].func = func;
- current->ret_stack[index].calltime = calltime;
+
+ /* Still keep it reserved even if an interrupt came in */
+ current->ret_stack[offset] = val;
+
+ ret_stack->ret = ret;
+ ret_stack->func = func;
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
- current->ret_stack[index].fp = frame_pointer;
+ ret_stack->fp = frame_pointer;
#endif
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
- current->ret_stack[index].retp = retp;
-#endif
- return 0;
+ ret_stack->retp = retp;
+ return offset;
}
/*
@@ -120,55 +647,92 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
# define MCOUNT_INSN_SIZE 0
#endif
-int function_graph_enter(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+/* If the caller does not use ftrace, call this function. */
+int function_graph_enter_regs(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp,
+ struct ftrace_regs *fregs)
{
struct ftrace_graph_ent trace;
+ unsigned long bitmap = 0;
+ int offset;
+ int bit;
+ int i;
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
- /*
- * Skip graph tracing if the return location is served by direct trampoline,
- * since call sequence and return addresses are unpredictable anyway.
- * Ex: BPF trampoline may call original function and may skip frame
- * depending on type of BPF programs attached.
- */
- if (ftrace_direct_func_count &&
- ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
+ bit = ftrace_test_recursion_trylock(func, ret);
+ if (bit < 0)
return -EBUSY;
-#endif
+
trace.func = func;
trace.depth = ++current->curr_ret_depth;
- if (ftrace_push_return_trace(ret, func, frame_pointer, retp))
+ offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0);
+ if (offset < 0)
goto out;
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace))
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ int save_curr_ret_stack = current->curr_ret_stack;
+
+ if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs))
+ bitmap |= BIT(fgraph_direct_gops->idx);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ } else
+#endif
+ {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
+ int save_curr_ret_stack;
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ save_curr_ret_stack = current->curr_ret_stack;
+ if (ftrace_ops_test(&gops->ops, func, NULL) &&
+ gops->entryfunc(&trace, gops, fregs))
+ bitmap |= BIT(i);
+ else
+ /* Clear out any saved storage */
+ current->curr_ret_stack = save_curr_ret_stack;
+ }
+ }
+
+ if (!bitmap)
goto out_ret;
+ /*
+ * Since this function uses fgraph_idx = 0 as a tail-call checking
+ * flag, set that bit always.
+ */
+ set_bitmap(current, offset, bitmap | BIT(0));
+ ftrace_test_recursion_unlock(bit);
return 0;
out_ret:
- current->curr_ret_stack--;
+ current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1;
out:
current->curr_ret_depth--;
+ ftrace_test_recursion_unlock(bit);
return -EBUSY;
}
/* Retrieve a function return address to the trace stack on thread info.*/
-static void
+static struct ftrace_ret_stack *
ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
- unsigned long frame_pointer)
+ unsigned long frame_pointer, int *offset)
{
- int index;
+ struct ftrace_ret_stack *ret_stack;
- index = current->curr_ret_stack;
+ ret_stack = get_ret_stack(current, current->curr_ret_stack, offset);
- if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+ if (unlikely(!ret_stack)) {
ftrace_graph_stop();
- WARN_ON(1);
+ WARN(1, "Bad function graph ret_stack pointer: %d",
+ current->curr_ret_stack);
/* Might as well panic, otherwise we have no where to go */
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
@@ -186,30 +750,32 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
* Note, -mfentry does not use frame pointers, and this test
* is not needed if CC_USING_FENTRY is set.
*/
- if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
+ if (unlikely(ret_stack->fp != frame_pointer)) {
ftrace_graph_stop();
WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
" from func %ps return to %lx\n",
- current->ret_stack[index].fp,
+ ret_stack->fp,
frame_pointer,
- (void *)current->ret_stack[index].func,
- current->ret_stack[index].ret);
+ (void *)ret_stack->func,
+ ret_stack->ret);
*ret = (unsigned long)panic;
- return;
+ return NULL;
}
#endif
- *ret = current->ret_stack[index].ret;
- trace->func = current->ret_stack[index].func;
- trace->calltime = current->ret_stack[index].calltime;
+ *offset += FGRAPH_FRAME_OFFSET;
+ *ret = ret_stack->ret;
+ trace->func = ret_stack->func;
trace->overrun = atomic_read(&current->trace_overrun);
- trace->depth = current->curr_ret_depth--;
+ trace->depth = current->curr_ret_depth;
/*
* We still want to trace interrupts coming in if
* max_depth is set to 1. Make sure the decrement is
* seen before ftrace_graph_return.
*/
barrier();
+
+ return ret_stack;
}
/*
@@ -237,52 +803,76 @@ static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
-/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */
-struct fgraph_ret_regs;
-
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
- unsigned long frame_pointer)
+static inline unsigned long
+__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer)
{
+ struct ftrace_ret_stack *ret_stack;
struct ftrace_graph_ret trace;
+ unsigned long bitmap;
unsigned long ret;
+ int offset;
+ int i;
+
+ ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset);
+
+ if (unlikely(!ret_stack)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ /* Might as well panic. What else to do? */
+ return (unsigned long)panic;
+ }
+
+ if (fregs)
+ ftrace_regs_set_instruction_pointer(fregs, ret);
- ftrace_pop_return_trace(&trace, &ret, frame_pointer);
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
- trace.retval = fgraph_ret_regs_return_value(ret_regs);
+ trace.retval = ftrace_regs_get_return_value(fregs);
#endif
- trace.rettime = trace_clock_local();
- ftrace_graph_return(&trace);
+
+ bitmap = get_bitmap_bits(current, offset);
+
+#ifdef CONFIG_HAVE_STATIC_CALL
+ if (static_branch_likely(&fgraph_do_direct)) {
+ if (test_bit(fgraph_direct_gops->idx, &bitmap))
+ static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs);
+ } else
+#endif
+ {
+ for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
+
+ if (gops == &fgraph_stub)
+ continue;
+
+ gops->retfunc(&trace, gops, fregs);
+ }
+ }
+
/*
* The ftrace_graph_return() may still access the current
* ret_stack structure, we need to make sure the update of
* curr_ret_stack is after that.
*/
barrier();
- current->curr_ret_stack--;
-
- if (unlikely(!ret)) {
- ftrace_graph_stop();
- WARN_ON(1);
- /* Might as well panic. What else to do? */
- ret = (unsigned long)panic;
- }
+ current->curr_ret_stack = offset - FGRAPH_FRAME_OFFSET;
+ current->curr_ret_depth--;
return ret;
}
/*
- * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can
- * leave only ftrace_return_to_handler(ret_regs).
+ * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can
+ * leave only ftrace_return_to_handler(fregs).
*/
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
-unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs)
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
+unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs)
{
- return __ftrace_return_to_handler(ret_regs,
- fgraph_ret_regs_frame_pointer(ret_regs));
+ return __ftrace_return_to_handler(fregs,
+ ftrace_regs_get_frame_pointer(fregs));
}
#else
unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
@@ -293,7 +883,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
/**
* ftrace_graph_get_ret_stack - return the entry of the shadow stack
- * @task: The task to read the shadow stack from
+ * @task: The task to read the shadow stack from.
* @idx: Index down the shadow stack
*
* Return the ret_struct on the shadow stack of the @task at the
@@ -305,115 +895,151 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
struct ftrace_ret_stack *
ftrace_graph_get_ret_stack(struct task_struct *task, int idx)
{
- idx = task->curr_ret_stack - idx;
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
- if (idx >= 0 && idx <= task->curr_ret_stack)
- return &task->ret_stack[idx];
+ if (offset < 0)
+ return NULL;
- return NULL;
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && --idx >= 0);
+
+ return ret_stack;
}
/**
- * ftrace_graph_ret_addr - convert a potentially modified stack return address
- * to its original value
+ * ftrace_graph_top_ret_addr - return the top return address in the shadow stack
+ * @task: The task to read the shadow stack from.
+ *
+ * Return the first return address on the shadow stack of the @task, which is
+ * not the fgraph's return_to_handler.
+ */
+unsigned long ftrace_graph_top_ret_addr(struct task_struct *task)
+{
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
+ struct ftrace_ret_stack *ret_stack = NULL;
+ int offset = task->curr_ret_stack;
+
+ if (offset < 0)
+ return 0;
+
+ do {
+ ret_stack = get_ret_stack(task, offset, &offset);
+ } while (ret_stack && ret_stack->ret == return_handler);
+
+ return ret_stack ? ret_stack->ret : 0;
+}
+
+/**
+ * ftrace_graph_ret_addr - return the original value of the return address
+ * @task: The task the unwinder is being executed on
+ * @idx: An initialized pointer to the next stack index to use
+ * @ret: The current return address (likely pointing to return_handler)
+ * @retp: The address on the stack of the current return location
*
* This function can be called by stack unwinding code to convert a found stack
- * return address ('ret') to its original value, in case the function graph
+ * return address (@ret) to its original value, in case the function graph
* tracer has modified it to be 'return_to_handler'. If the address hasn't
- * been modified, the unchanged value of 'ret' is returned.
+ * been modified, the unchanged value of @ret is returned.
*
- * 'idx' is a state variable which should be initialized by the caller to zero
- * before the first call.
+ * @idx holds the last index used to know where to start from. It should be
+ * initialized to zero for the first iteration as that will mean to start
+ * at the top of the shadow stack. If the location is found, this pointer
+ * will be assigned that location so that if called again, it will continue
+ * where it left off.
*
- * 'retp' is a pointer to the return address on the stack. It's ignored if
- * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ * @retp is a pointer to the return address on the stack.
*/
-#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp)
{
- int index = task->curr_ret_stack;
+ struct ftrace_ret_stack *ret_stack;
+ unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
int i;
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+ if (ret != return_handler)
return ret;
- if (index < 0)
+ if (!idx)
return ret;
- for (i = 0; i <= index; i++)
- if (task->ret_stack[i].retp == retp)
- return task->ret_stack[i].ret;
+ i = *idx ? : task->curr_ret_stack;
+ while (i > 0) {
+ ret_stack = get_ret_stack(task, i, &i);
+ if (!ret_stack)
+ break;
+ /*
+ * For the tail-call, there would be 2 or more ftrace_ret_stacks on
+ * the ret_stack, which records "return_to_handler" as the return
+ * address except for the last one.
+ * But on the real stack, there should be 1 entry because tail-call
+ * reuses the return address on the stack and jump to the next function.
+ * Thus we will continue to find real return address.
+ */
+ if (ret_stack->retp == retp &&
+ ret_stack->ret != return_handler) {
+ *idx = i;
+ return ret_stack->ret;
+ }
+ }
return ret;
}
-#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
-unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
- unsigned long ret, unsigned long *retp)
-{
- int task_idx;
-
- if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
- return ret;
-
- task_idx = task->curr_ret_stack;
-
- if (!task->ret_stack || task_idx < *idx)
- return ret;
-
- task_idx -= *idx;
- (*idx)++;
-
- return task->ret_stack[task_idx].ret;
-}
-#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
static struct ftrace_ops graph_ops = {
.func = ftrace_graph_func,
- .flags = FTRACE_OPS_FL_INITIALIZED |
- FTRACE_OPS_FL_PID |
- FTRACE_OPS_GRAPH_STUB,
+ .flags = FTRACE_OPS_GRAPH_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
#endif
- ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
-void ftrace_graph_sleep_time_control(bool enable)
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops)
{
- fgraph_sleep_time = enable;
+ dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (src_ops) {
+ dst_ops->func_hash = &src_ops->local_hash;
+ mutex_init(&dst_ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&dst_ops->subop_list);
+ dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED;
+ }
+#endif
}
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+void ftrace_graph_sleep_time_control(bool enable)
{
- return 0;
+ fgraph_sleep_time = enable;
}
/*
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
*/
-extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
/* The callbacks that hook a function */
trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
-static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
{
int i;
int ret = 0;
int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
struct task_struct *g, *t;
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return -ENOMEM;
+
for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
- ret_stack_list[i] =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack_list[i]) {
start = 0;
end = i;
@@ -431,9 +1057,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
if (t->ret_stack == NULL) {
atomic_set(&t->trace_overrun, 0);
- t->curr_ret_stack = -1;
+ ret_stack_init_task_vars(ret_stack_list[start]);
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
- /* Make sure the tasks see the -1 first: */
+ /* Make sure the tasks see the 0 first: */
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
@@ -443,7 +1070,7 @@ unlock:
rcu_read_unlock();
free:
for (i = start; i < end; i++)
- kfree(ret_stack_list[i]);
+ kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]);
return ret;
}
@@ -454,7 +1081,6 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
unsigned int prev_state)
{
unsigned long long timestamp;
- int index;
/*
* Does the user want to count the time a function was asleep.
@@ -471,63 +1097,19 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
if (!next->ftrace_timestamp)
return;
- /*
- * Update all the counters in next to make up for the
- * time next was sleeping.
- */
- timestamp -= next->ftrace_timestamp;
-
- for (index = next->curr_ret_stack; index >= 0; index--)
- next->ret_stack[index].calltime += timestamp;
+ next->ftrace_sleeptime += timestamp - next->ftrace_timestamp;
}
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
- if (!ftrace_ops_test(&global_ops, trace->func, NULL))
- return 0;
- return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
- struct ftrace_ops *op;
- bool do_test = false;
-
- /*
- * The graph and global ops share the same set of functions
- * to test. If any other ops is on the list, then
- * the graph tracing needs to test if its the function
- * it should call.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (op != &global_ops && op != &graph_ops &&
- op != &ftrace_list_end) {
- do_test = true;
- /* in double loop, break out with goto */
- goto out;
- }
- } while_for_each_ftrace_op(op);
- out:
- if (do_test)
- ftrace_graph_entry = ftrace_graph_entry_test;
- else
- ftrace_graph_entry = __ftrace_graph_entry;
-}
-
-static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
static void
-graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+graph_init_task(struct task_struct *t, unsigned long *ret_stack)
{
atomic_set(&t->trace_overrun, 0);
+ ret_stack_init_task_vars(ret_stack);
t->ftrace_timestamp = 0;
+ t->curr_ret_stack = 0;
+ t->curr_ret_depth = -1;
/* make curr_ret_stack visible before we add the ret_stack */
smp_wmb();
t->ret_stack = ret_stack;
@@ -539,7 +1121,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
*/
void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
{
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
/*
* The idle task has no parent, it either has its own
@@ -549,14 +1131,14 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
+
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
ret_stack = per_cpu(idle_ret_stack, cpu);
if (!ret_stack) {
- ret_stack =
- kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
per_cpu(idle_ret_stack, cpu) = ret_stack;
@@ -570,15 +1152,16 @@ void ftrace_graph_init_task(struct task_struct *t)
{
/* Make sure we do not use the parent ret_stack */
t->ret_stack = NULL;
- t->curr_ret_stack = -1;
+ t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
if (ftrace_graph_active) {
- struct ftrace_ret_stack *ret_stack;
+ unsigned long *ret_stack;
- ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH,
- sizeof(struct ftrace_ret_stack),
- GFP_KERNEL);
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+
+ ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL);
if (!ret_stack)
return;
graph_init_task(t, ret_stack);
@@ -587,24 +1170,67 @@ void ftrace_graph_init_task(struct task_struct *t)
void ftrace_graph_exit_task(struct task_struct *t)
{
- struct ftrace_ret_stack *ret_stack = t->ret_stack;
+ unsigned long *ret_stack = t->ret_stack;
t->ret_stack = NULL;
/* NULL must become visible to IRQs before we free it: */
barrier();
- kfree(ret_stack);
+ if (ret_stack) {
+ if (WARN_ON_ONCE(!fgraph_stack_cachep))
+ return;
+ kmem_cache_free(fgraph_stack_cachep, ret_stack);
+ }
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static int fgraph_pid_func(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ struct trace_array *tr = gops->ops.private;
+ int pid;
+
+ if (tr) {
+ pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid);
+ if (pid == FTRACE_PID_IGNORE)
+ return 0;
+ if (pid != FTRACE_PID_TRACE &&
+ pid != current->pid)
+ return 0;
+ }
+
+ return gops->saved_func(trace, gops, fregs);
+}
+
+void fgraph_update_pid_func(void)
+{
+ struct fgraph_ops *gops;
+ struct ftrace_ops *op;
+
+ if (!(graph_ops.flags & FTRACE_OPS_FL_INITIALIZED))
+ return;
+
+ list_for_each_entry(op, &graph_ops.subop_list, list) {
+ if (op->flags & FTRACE_OPS_FL_PID) {
+ gops = container_of(op, struct fgraph_ops, ops);
+ gops->entryfunc = ftrace_pids_enabled(op) ?
+ fgraph_pid_func : gops->saved_func;
+ if (ftrace_graph_active == 1)
+ static_call_update(fgraph_func, gops->entryfunc);
+ }
+ }
}
+#endif
/* Allocate a return stack for each task */
static int start_graph_tracing(void)
{
- struct ftrace_ret_stack **ret_stack_list;
+ unsigned long **ret_stack_list;
int ret, cpu;
- ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE,
- sizeof(struct ftrace_ret_stack *),
- GFP_KERNEL);
+ ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(*ret_stack_list), GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
@@ -630,60 +1256,182 @@ static int start_graph_tracing(void)
return ret;
}
+static void init_task_vars(int idx)
+{
+ struct task_struct *g, *t;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (idle_task(cpu)->ret_stack)
+ ret_stack_set_task_var(idle_task(cpu), idx, 0);
+ }
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, t) {
+ if (t->ret_stack)
+ ret_stack_set_task_var(t, idx, 0);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops)
+{
+ trace_func_graph_ent_t func = NULL;
+ trace_func_graph_ret_t retfunc = NULL;
+ int i;
+
+ if (gops) {
+ func = gops->entryfunc;
+ retfunc = gops->retfunc;
+ fgraph_direct_gops = gops;
+ } else {
+ for_each_set_bit(i, &fgraph_array_bitmask,
+ sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
+ func = fgraph_array[i]->entryfunc;
+ retfunc = fgraph_array[i]->retfunc;
+ fgraph_direct_gops = fgraph_array[i];
+ }
+ }
+ if (WARN_ON_ONCE(!func))
+ return;
+
+ static_call_update(fgraph_func, func);
+ static_call_update(fgraph_retfunc, retfunc);
+ if (enable_branch)
+ static_branch_disable(&fgraph_do_direct);
+}
+
+static void ftrace_graph_disable_direct(bool disable_branch)
+{
+ if (disable_branch)
+ static_branch_disable(&fgraph_do_direct);
+ static_call_update(fgraph_func, ftrace_graph_entry_stub);
+ static_call_update(fgraph_retfunc, ftrace_graph_ret_stub);
+ fgraph_direct_gops = &fgraph_stub;
+}
+
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
+ int command = 0;
int ret = 0;
+ int i = -1;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
- /* we currently allow only one tracer registered at a time */
- if (ftrace_graph_active) {
- ret = -EBUSY;
- goto out;
+ if (!fgraph_stack_cachep) {
+ fgraph_stack_cachep = kmem_cache_create("fgraph_stack",
+ SHADOW_STACK_SIZE,
+ SHADOW_STACK_SIZE, 0, NULL);
+ if (!fgraph_stack_cachep)
+ return -ENOMEM;
}
- register_pm_notifier(&ftrace_suspend_notifier);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
+ if (!fgraph_array[0]) {
+ /* The array must always have real data on it */
+ for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+ fgraph_array[i] = &fgraph_stub;
+ fgraph_lru_init();
+ }
+
+ i = fgraph_lru_alloc_index();
+ if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub))
+ return -ENOSPC;
+ gops->idx = i;
ftrace_graph_active++;
- ret = start_graph_tracing();
- if (ret) {
- ftrace_graph_active--;
- goto out;
+
+ if (ftrace_graph_active == 2)
+ ftrace_graph_disable_direct(true);
+
+ if (ftrace_graph_active == 1) {
+ ftrace_graph_enable_direct(false, gops);
+ register_pm_notifier(&ftrace_suspend_notifier);
+ ret = start_graph_tracing();
+ if (ret)
+ goto error;
+ /*
+ * Some archs just test to see if these are not
+ * the default function
+ */
+ ftrace_graph_return = return_run;
+ ftrace_graph_entry = entry_run;
+ command = FTRACE_START_FUNC_RET;
+ } else {
+ init_task_vars(gops->idx);
}
+ /* Always save the function, and reset at unregistering */
+ gops->saved_func = gops->entryfunc;
- ftrace_graph_return = gops->retfunc;
+ gops->ops.flags |= FTRACE_OPS_FL_GRAPH;
- /*
- * Update the indirect function to the entryfunc, and the
- * function that gets called to the entry_test first. Then
- * call the update fgraph entry function to determine if
- * the entryfunc should be called directly or not.
- */
- __ftrace_graph_entry = gops->entryfunc;
- ftrace_graph_entry = ftrace_graph_entry_test;
- update_function_graph_func();
+ ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
+ if (!ret)
+ fgraph_array[i] = gops;
- ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-out:
- mutex_unlock(&ftrace_lock);
+error:
+ if (ret) {
+ ftrace_graph_active--;
+ gops->saved_func = NULL;
+ fgraph_lru_release_index(i);
+ }
return ret;
}
void unregister_ftrace_graph(struct fgraph_ops *gops)
{
- mutex_lock(&ftrace_lock);
+ int command = 0;
+
+ guard(mutex)(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
- goto out;
+ return;
+
+ if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
+ fgraph_array[gops->idx] != gops))
+ return;
+
+ if (fgraph_lru_release_index(gops->idx) < 0)
+ return;
+
+ fgraph_array[gops->idx] = &fgraph_stub;
ftrace_graph_active--;
- ftrace_graph_return = ftrace_stub_graph;
- ftrace_graph_entry = ftrace_graph_entry_stub;
- __ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
- unregister_pm_notifier(&ftrace_suspend_notifier);
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
- out:
- mutex_unlock(&ftrace_lock);
+ if (!ftrace_graph_active)
+ command = FTRACE_STOP_FUNC_RET;
+
+ ftrace_shutdown_subops(&graph_ops, &gops->ops, command);
+
+ if (ftrace_graph_active == 1)
+ ftrace_graph_enable_direct(true, NULL);
+ else if (!ftrace_graph_active)
+ ftrace_graph_disable_direct(false);
+
+ if (!ftrace_graph_active) {
+ ftrace_graph_return = ftrace_stub_graph;
+ ftrace_graph_entry = ftrace_graph_entry_stub;
+ unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+ }
+ gops->saved_func = NULL;
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9ff018245840..ba7ff14f5339 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -8,98 +8,227 @@
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
-#include <linux/rethook.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <asm/fprobe.h>
+
#include "trace.h"
-struct fprobe_rethook_node {
- struct rethook_node node;
- unsigned long entry_ip;
- unsigned long entry_parent_ip;
- char data[];
-};
+#define FPROBE_IP_HASH_BITS 8
+#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
-static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
- struct fprobe_rethook_node *fpr;
- struct rethook_node *rh = NULL;
- struct fprobe *fp;
- void *entry_data = NULL;
- int ret = 0;
+#define FPROBE_HASH_BITS 6
+#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
- fp = container_of(ops, struct fprobe, ops);
+#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2))
- if (fp->exit_handler) {
- rh = rethook_try_get(fp->rethook);
- if (!rh) {
- fp->nmissed++;
- return;
- }
- fpr = container_of(rh, struct fprobe_rethook_node, node);
- fpr->entry_ip = ip;
- fpr->entry_parent_ip = parent_ip;
- if (fp->entry_data_size)
- entry_data = fpr->data;
+/*
+ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
+ * exists. The key is the address of fprobe instance.
+ * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
+ * instance related to the funciton address. The key is the ftrace IP
+ * address.
+ *
+ * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
+ * are set NULL and delete those from both hash tables (by hlist_del_rcu).
+ * After an RCU grace period, the fprobe_hlist itself will be released.
+ *
+ * fprobe_table and fprobe_ip_table can be accessed from either
+ * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held.
+ * - RCU hlist traversal under disabling preempt
+ */
+static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
+static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static DEFINE_MUTEX(fprobe_mutex);
+
+/*
+ * Find first fprobe in the hlist. It will be iterated twice in the entry
+ * probe, once for correcting the total required size, the second time is
+ * calling back the user handlers.
+ * Thus the hlist in the fprobe_table must be sorted and new probe needs to
+ * be added *before* the first fprobe.
+ */
+static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+{
+ struct fprobe_hlist_node *node;
+ struct hlist_head *head;
+
+ head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+ hlist_for_each_entry_rcu(node, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (node->addr == ip)
+ return node;
}
+ return NULL;
+}
+NOKPROBE_SYMBOL(find_first_fprobe_node);
- if (fp->entry_handler)
- ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
+/* Node insertion and deletion requires the fprobe_mutex */
+static void insert_fprobe_node(struct fprobe_hlist_node *node)
+{
+ unsigned long ip = node->addr;
+ struct fprobe_hlist_node *next;
+ struct hlist_head *head;
- /* If entry_handler returns !0, nmissed is not counted. */
- if (rh) {
- if (ret)
- rethook_recycle(rh);
- else
- rethook_hook(rh, ftrace_get_regs(fregs), true);
+ lockdep_assert_held(&fprobe_mutex);
+
+ next = find_first_fprobe_node(ip);
+ if (next) {
+ hlist_add_before_rcu(&node->hlist, &next->hlist);
+ return;
}
+ head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+ hlist_add_head_rcu(&node->hlist, head);
}
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Return true if there are synonims */
+static bool delete_fprobe_node(struct fprobe_hlist_node *node)
{
- struct fprobe *fp;
- int bit;
+ lockdep_assert_held(&fprobe_mutex);
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+ /* Avoid double deleting */
+ if (READ_ONCE(node->fp) != NULL) {
+ WRITE_ONCE(node->fp, NULL);
+ hlist_del_rcu(&node->hlist);
+ }
+ return !!find_first_fprobe_node(node->addr);
+}
- /* recursion detection has to go before any traceable function and
- * all functions before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
+/* Check existence of the fprobe */
+static bool is_fprobe_still_exist(struct fprobe *fp)
+{
+ struct hlist_head *head;
+ struct fprobe_hlist *fph;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_for_each_entry_rcu(fph, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (fph->fp == fp)
+ return true;
}
- __fprobe_handler(ip, parent_ip, ops, fregs);
- ftrace_test_recursion_unlock(bit);
+ return false;
+}
+NOKPROBE_SYMBOL(is_fprobe_still_exist);
+
+static int add_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+ struct hlist_head *head;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+ if (is_fprobe_still_exist(fp))
+ return -EEXIST;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_add_head_rcu(&fp->hlist_array->hlist, head);
+ return 0;
}
-NOKPROBE_SYMBOL(fprobe_handler);
-static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static int del_fprobe_hash(struct fprobe *fp)
{
+ struct fprobe_hlist *fph = fp->hlist_array;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ if (!is_fprobe_still_exist(fp))
+ return -ENOENT;
+
+ fph->fp = NULL;
+ hlist_del_rcu(&fph->hlist);
+ return 0;
+}
+
+#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER
+
+/* The arch should encode fprobe_header info into one unsigned long */
+#define FPROBE_HEADER_SIZE_IN_LONG 1
+
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
+ !arch_fprobe_header_encodable(fp)))
+ return false;
+
+ *stack = arch_encode_fprobe_header(fp, size_words);
+ return true;
+}
+
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
+{
+ *fp = arch_decode_fprobe_header_fp(*stack);
+ *size_words = arch_decode_fprobe_header_size(*stack);
+}
+
+#else
+
+/* Generic fprobe_header */
+struct __fprobe_header {
struct fprobe *fp;
- int bit;
+ unsigned long size_words;
+} __packed;
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header))
- /* recursion detection has to go before any traceable function and
- * all functions called before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD))
+ return false;
+
+ fph->fp = fp;
+ fph->size_words = size_words;
+ return true;
+}
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ *fp = fph->fp;
+ *size_words = fph->size_words;
+}
+
+#endif
+
+/*
+ * fprobe shadow stack management:
+ * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
+ * among the probes on the same function exit. Note that a new probe can be
+ * registered before a target function is returning, we can not use the hash
+ * table to find the corresponding probes. Thus the probe address is stored on
+ * the shadow stack with its entry data size.
+ *
+ */
+static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ if (!fp->entry_handler)
+ return 0;
+
+ return fp->entry_handler(fp, ip, parent_ip, fregs, data);
+}
+
+static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ int ret;
/*
* This user handler is shared with other kprobes and is not expected to be
* called recursively. So if any other kprobe handler is running, this will
@@ -108,44 +237,279 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
*/
if (unlikely(kprobe_running())) {
fp->nmissed++;
- goto recursion_unlock;
+ return 0;
}
kprobe_busy_begin();
- __fprobe_handler(ip, parent_ip, ops, fregs);
+ ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
kprobe_busy_end();
-
-recursion_unlock:
- ftrace_test_recursion_unlock(bit);
+ return ret;
}
-static void fprobe_exit_handler(struct rethook_node *rh, void *data,
- unsigned long ret_ip, struct pt_regs *regs)
+static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct fprobe *fp = (struct fprobe *)data;
- struct fprobe_rethook_node *fpr;
- int bit;
+ struct fprobe_hlist_node *node, *first;
+ unsigned long *fgraph_data = NULL;
+ unsigned long func = trace->func;
+ unsigned long ret_ip;
+ int reserved_words;
+ struct fprobe *fp;
+ int used, ret;
- if (!fp || fprobe_disabled(fp))
- return;
+ if (WARN_ON_ONCE(!fregs))
+ return 0;
- fpr = container_of(rh, struct fprobe_rethook_node, node);
+ first = node = find_first_fprobe_node(func);
+ if (unlikely(!first))
+ return 0;
+
+ reserved_words = 0;
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (!fp || !fp->exit_handler)
+ continue;
+ /*
+ * Since fprobe can be enabled until the next loop, we ignore the
+ * fprobe's disabled flag in this loop.
+ */
+ reserved_words +=
+ FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
+ }
+ node = first;
+ if (reserved_words) {
+ fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
+ if (unlikely(!fgraph_data)) {
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (fp && !fprobe_disabled(fp))
+ fp->nmissed++;
+ }
+ return 0;
+ }
+ }
/*
- * we need to assure no calls to traceable functions in-between the
- * end of fprobe_handler and the beginning of fprobe_exit_handler.
+ * TODO: recursion detection has been done in the fgraph. Thus we need
+ * to add a callback to increment missed counter.
*/
- bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
- if (bit < 0) {
- fp->nmissed++;
+ ret_ip = ftrace_regs_get_return_address(fregs);
+ used = 0;
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ int data_size;
+ void *data;
+
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (!fp || fprobe_disabled(fp))
+ continue;
+
+ data_size = fp->entry_data_size;
+ if (data_size && fp->exit_handler)
+ data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
+ else
+ data = NULL;
+
+ if (fprobe_shared_with_kprobes(fp))
+ ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
+ else
+ ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
+
+ /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */
+ if (!ret && fp->exit_handler) {
+ int size_words = SIZE_IN_LONG(data_size);
+
+ if (write_fprobe_header(&fgraph_data[used], fp, size_words))
+ used += FPROBE_HEADER_SIZE_IN_LONG + size_words;
+ }
+ }
+ if (used < reserved_words)
+ memset(fgraph_data + used, 0, reserved_words - used);
+
+ /* If any exit_handler is set, data must be used. */
+ return used != 0;
+}
+NOKPROBE_SYMBOL(fprobe_entry);
+
+static void fprobe_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ unsigned long *fgraph_data = NULL;
+ unsigned long ret_ip;
+ struct fprobe *fp;
+ int size, curr;
+ int size_words;
+
+ fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
+ if (WARN_ON_ONCE(!fgraph_data))
return;
+ size_words = SIZE_IN_LONG(size);
+ ret_ip = ftrace_regs_get_instruction_pointer(fregs);
+
+ preempt_disable();
+
+ curr = 0;
+ while (size_words > curr) {
+ read_fprobe_header(&fgraph_data[curr], &fp, &size);
+ if (!fp)
+ break;
+ curr += FPROBE_HEADER_SIZE_IN_LONG;
+ if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+ if (WARN_ON_ONCE(curr + size > size_words))
+ break;
+ fp->exit_handler(fp, trace->func, ret_ip, fregs,
+ size ? fgraph_data + curr : NULL);
+ }
+ curr += size;
}
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(fprobe_return);
+
+static struct fgraph_ops fprobe_graph_ops = {
+ .entryfunc = fprobe_entry,
+ .retfunc = fprobe_return,
+};
+static int fprobe_graph_active;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
- fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
- fp->entry_data_size ? (void *)fpr->data : NULL);
- ftrace_test_recursion_unlock(bit);
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_graph_active) {
+ ret = register_ftrace_graph(&fprobe_graph_ops);
+ if (WARN_ON_ONCE(ret)) {
+ ftrace_free_filter(&fprobe_graph_ops.ops);
+ return ret;
+ }
+ }
+ fprobe_graph_active++;
+ return 0;
+}
+
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ fprobe_graph_active--;
+ /* Q: should we unregister it ? */
+ if (!fprobe_graph_active)
+ unregister_ftrace_graph(&fprobe_graph_ops);
+
+ if (num)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
+}
+
+#ifdef CONFIG_MODULES
+
+#define FPROBE_IPS_BATCH_INIT 8
+/* instruction pointer address list */
+struct fprobe_addr_list {
+ int index;
+ int size;
+ unsigned long *addrs;
+};
+
+static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr)
+{
+ unsigned long *addrs;
+
+ if (alist->index >= alist->size)
+ return -ENOMEM;
+
+ alist->addrs[alist->index++] = addr;
+ if (alist->index < alist->size)
+ return 0;
+
+ /* Expand the address list */
+ addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
+ alist->size *= 2;
+ kfree(alist->addrs);
+ alist->addrs = addrs;
+
+ return 0;
+}
+
+static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head,
+ struct fprobe_addr_list *alist)
+{
+ struct fprobe_hlist_node *node;
+ int ret = 0;
+
+ hlist_for_each_entry_rcu(node, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (!within_module(node->addr, mod))
+ continue;
+ if (delete_fprobe_node(node))
+ continue;
+ /*
+ * If failed to update alist, just continue to update hlist.
+ * Therefore, at list user handler will not hit anymore.
+ */
+ if (!ret)
+ ret = fprobe_addr_list_add(alist, node->addr);
+ }
}
-NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+/* Handle module unloading to manage fprobe_ip_table. */
+static int fprobe_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT};
+ struct module *mod = data;
+ int i;
+
+ if (val != MODULE_STATE_GOING)
+ return NOTIFY_DONE;
+
+ alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL);
+ /* If failed to alloc memory, we can not remove ips from hash. */
+ if (!alist.addrs)
+ return NOTIFY_DONE;
+
+ mutex_lock(&fprobe_mutex);
+ for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++)
+ fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist);
+
+ if (alist.index < alist.size && alist.index > 0)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops,
+ alist.addrs, alist.index, 1, 0);
+ mutex_unlock(&fprobe_mutex);
+
+ kfree(alist.addrs);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fprobe_module_nb = {
+ .notifier_call = fprobe_module_callback,
+ .priority = 0,
+};
+
+static int __init init_fprobe_module(void)
+{
+ return register_module_notifier(&fprobe_module_nb);
+}
+early_initcall(init_fprobe_module);
+#endif
static int symbols_cmp(const void *a, const void *b)
{
@@ -175,53 +539,115 @@ static unsigned long *get_ftrace_locations(const char **syms, int num)
return ERR_PTR(-ENOENT);
}
-static void fprobe_init(struct fprobe *fp)
-{
- fp->nmissed = 0;
- if (fprobe_shared_with_kprobes(fp))
- fp->ops.func = fprobe_kprobe_handler;
- else
- fp->ops.func = fprobe_handler;
- fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
-}
+struct filter_match_data {
+ const char *filter;
+ const char *notfilter;
+ size_t index;
+ size_t size;
+ unsigned long *addrs;
+ struct module **mods;
+};
-static int fprobe_init_rethook(struct fprobe *fp, int num)
+static int filter_match_callback(void *data, const char *name, unsigned long addr)
{
- int size;
+ struct filter_match_data *match = data;
+
+ if (!glob_match(match->filter, name) ||
+ (match->notfilter && glob_match(match->notfilter, name)))
+ return 0;
- if (!fp->exit_handler) {
- fp->rethook = NULL;
+ if (!ftrace_location(addr))
return 0;
+
+ if (match->addrs) {
+ struct module *mod = __module_text_address(addr);
+
+ if (mod && !try_module_get(mod))
+ return 0;
+
+ match->mods[match->index] = mod;
+ match->addrs[match->index] = addr;
}
+ match->index++;
+ return match->index == match->size;
+}
- /* Initialize rethook if needed */
- if (fp->nr_maxactive)
- num = fp->nr_maxactive;
- else
- num *= num_possible_cpus() * 2;
- if (num <= 0)
- return -EINVAL;
+/*
+ * Make IP list from the filter/no-filter glob patterns.
+ * Return the number of matched symbols, or errno.
+ * If @addrs == NULL, this just counts the number of matched symbols. If @addrs
+ * is passed with an array, we need to pass the an @mods array of the same size
+ * to increment the module refcount for each symbol.
+ * This means we also need to call `module_put` for each element of @mods after
+ * using the @addrs.
+ */
+static int get_ips_from_filter(const char *filter, const char *notfilter,
+ unsigned long *addrs, struct module **mods,
+ size_t size)
+{
+ struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
+ .index = 0, .size = size, .addrs = addrs, .mods = mods};
+ int ret;
- size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+ if (addrs && !mods)
+ return -EINVAL;
- /* Initialize rethook */
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
- if (IS_ERR(fp->rethook))
- return PTR_ERR(fp->rethook);
+ ret = kallsyms_on_each_symbol(filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+ if (IS_ENABLED(CONFIG_MODULES)) {
+ ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+ }
- return 0;
+ return match.index ?: -ENOENT;
}
static void fprobe_fail_cleanup(struct fprobe *fp)
{
- if (!IS_ERR_OR_NULL(fp->rethook)) {
- /* Don't need to cleanup rethook->handler because this is not used. */
- rethook_free(fp->rethook);
- fp->rethook = NULL;
+ kfree(fp->hlist_array);
+ fp->hlist_array = NULL;
+}
+
+/* Initialize the fprobe data structure. */
+static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ struct fprobe_hlist *hlist_array;
+ unsigned long addr;
+ int size, i;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ size = ALIGN(fp->entry_data_size, sizeof(long));
+ if (size > MAX_FPROBE_DATA_SIZE)
+ return -E2BIG;
+ fp->entry_data_size = size;
+
+ hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
+ if (!hlist_array)
+ return -ENOMEM;
+
+ fp->nmissed = 0;
+
+ hlist_array->size = num;
+ fp->hlist_array = hlist_array;
+ hlist_array->fp = fp;
+ for (i = 0; i < num; i++) {
+ hlist_array->array[i].fp = fp;
+ addr = ftrace_location(addrs[i]);
+ if (!addr) {
+ fprobe_fail_cleanup(fp);
+ return -ENOENT;
+ }
+ hlist_array->array[i].addr = addr;
}
- ftrace_free_filter(&fp->ops);
+ return 0;
}
+#define FPROBE_IPS_MAX INT_MAX
+
/**
* register_fprobe() - Register fprobe to ftrace by pattern.
* @fp: A fprobe data structure to be registered.
@@ -235,46 +661,35 @@ static void fprobe_fail_cleanup(struct fprobe *fp)
*/
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
{
- struct ftrace_hash *hash;
- unsigned char *str;
- int ret, len;
+ unsigned long *addrs __free(kfree) = NULL;
+ struct module **mods __free(kfree) = NULL;
+ int ret, num;
if (!fp || !filter)
return -EINVAL;
- fprobe_init(fp);
+ num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
+ if (num < 0)
+ return num;
- len = strlen(filter);
- str = kstrdup(filter, GFP_KERNEL);
- ret = ftrace_set_filter(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
- return ret;
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
- if (notfilter) {
- len = strlen(notfilter);
- str = kstrdup(notfilter, GFP_KERNEL);
- ret = ftrace_set_notrace(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
- goto out;
- }
+ mods = kcalloc(num, sizeof(*mods), GFP_KERNEL);
+ if (!mods)
+ return -ENOMEM;
- /* TODO:
- * correctly calculate the total number of filtered symbols
- * from both filter and notfilter.
- */
- hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
- if (WARN_ON_ONCE(!hash))
- goto out;
+ ret = get_ips_from_filter(filter, notfilter, addrs, mods, num);
+ if (ret < 0)
+ return ret;
- ret = fprobe_init_rethook(fp, (int)hash->count);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ ret = register_fprobe_ips(fp, addrs, ret);
-out:
- if (ret)
- fprobe_fail_cleanup(fp);
+ for (int i = 0; i < num; i++) {
+ if (mods[i])
+ module_put(mods[i]);
+ }
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe);
@@ -282,7 +697,7 @@ EXPORT_SYMBOL_GPL(register_fprobe);
/**
* register_fprobe_ips() - Register fprobe to ftrace by address.
* @fp: A fprobe data structure to be registered.
- * @addrs: An array of target ftrace location addresses.
+ * @addrs: An array of target function address.
* @num: The number of entries of @addrs.
*
* Register @fp to ftrace for enabling the probe on the address given by @addrs.
@@ -294,23 +709,27 @@ EXPORT_SYMBOL_GPL(register_fprobe);
*/
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
{
- int ret;
+ struct fprobe_hlist *hlist_array;
+ int ret, i;
- if (!fp || !addrs || num <= 0)
- return -EINVAL;
-
- fprobe_init(fp);
-
- ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ ret = fprobe_init(fp, addrs, num);
if (ret)
return ret;
- ret = fprobe_init_rethook(fp, num);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ mutex_lock(&fprobe_mutex);
+
+ hlist_array = fp->hlist_array;
+ ret = fprobe_graph_add_ips(addrs, num);
+ if (!ret) {
+ add_fprobe_hash(fp);
+ for (i = 0; i < hlist_array->size; i++)
+ insert_fprobe_node(&hlist_array->array[i]);
+ }
+ mutex_unlock(&fprobe_mutex);
if (ret)
fprobe_fail_cleanup(fp);
+
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -348,14 +767,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms);
bool fprobe_is_registered(struct fprobe *fp)
{
- if (!fp || (fp->ops.saved_func != fprobe_handler &&
- fp->ops.saved_func != fprobe_kprobe_handler))
+ if (!fp || !fp->hlist_array)
return false;
return true;
}
/**
- * unregister_fprobe() - Unregister fprobe from ftrace
+ * unregister_fprobe() - Unregister fprobe.
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
@@ -364,23 +782,40 @@ bool fprobe_is_registered(struct fprobe *fp)
*/
int unregister_fprobe(struct fprobe *fp)
{
- int ret;
+ struct fprobe_hlist *hlist_array;
+ unsigned long *addrs = NULL;
+ int ret = 0, i, count;
- if (!fprobe_is_registered(fp))
- return -EINVAL;
+ mutex_lock(&fprobe_mutex);
+ if (!fp || !is_fprobe_still_exist(fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ hlist_array = fp->hlist_array;
+ addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
+ if (!addrs) {
+ ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */
+ goto out;
+ }
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_stop(fp->rethook);
+ /* Remove non-synonim ips from table and hash */
+ count = 0;
+ for (i = 0; i < hlist_array->size; i++) {
+ if (!delete_fprobe_node(&hlist_array->array[i]))
+ addrs[count++] = hlist_array->array[i].addr;
+ }
+ del_fprobe_hash(fp);
- ret = unregister_ftrace_function(&fp->ops);
- if (ret < 0)
- return ret;
+ fprobe_graph_remove_ips(addrs, count);
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_free(fp->rethook);
+ kfree_rcu(hlist_array, rcu);
+ fp->hlist_array = NULL;
- ftrace_free_filter(&fp->ops);
+out:
+ mutex_unlock(&fprobe_mutex);
+ kfree(addrs);
return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da1710499698..1af952cba48d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -74,7 +74,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
+ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), \
+ .subop_list = LIST_HEAD_INIT(opsname.subop_list),
#else
#define INIT_OPS_HASH(opsname)
#endif
@@ -99,7 +100,7 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
static struct ftrace_ops *set_function_trace_op;
-static bool ftrace_pids_enabled(struct ftrace_ops *ops)
+bool ftrace_pids_enabled(struct ftrace_ops *ops)
{
struct trace_array *tr;
@@ -121,7 +122,7 @@ static int ftrace_disabled __read_mostly;
DEFINE_MUTEX(ftrace_lock);
-struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
+struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = (struct ftrace_ops __rcu *)&ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
struct ftrace_ops global_ops;
@@ -161,12 +162,14 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
mutex_init(&ops->local_hash.regex_lock);
+ INIT_LIST_HEAD(&ops->subop_list);
ops->func_hash = &ops->local_hash;
ops->flags |= FTRACE_OPS_FL_INITIALIZED;
}
#endif
}
+/* Call this function for when a callback filters on set_ftrace_pid */
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
@@ -235,8 +238,6 @@ static void update_ftrace_function(void)
func = ftrace_ops_list_func;
}
- update_function_graph_func();
-
/* If there's no change, then do nothing more here */
if (ftrace_trace_function == func)
return;
@@ -310,7 +311,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
lockdep_is_held(&ftrace_lock)) == ops &&
rcu_dereference_protected(ops->next,
lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
- *list = &ftrace_list_end;
+ rcu_assign_pointer(*list, &ftrace_list_end);
return 0;
}
@@ -406,6 +407,8 @@ static void ftrace_update_pid_func(void)
}
} while_for_each_ftrace_op(op);
+ fgraph_update_pid_func();
+
update_ftrace_function();
}
@@ -533,24 +536,22 @@ static int function_stat_show(struct seq_file *m, void *v)
{
struct ftrace_profile *rec = v;
char str[KSYM_SYMBOL_LEN];
- int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static struct trace_seq s;
unsigned long long avg;
unsigned long long stddev;
+ unsigned long long stddev_denom;
#endif
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
/* we raced with function_profile_reset() */
- if (unlikely(rec->counter == 0)) {
- ret = -EBUSY;
- goto out;
- }
+ if (unlikely(rec->counter == 0))
+ return -EBUSY;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
- goto out;
+ return 0;
#endif
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_puts(m, " ");
- /* Sample standard deviation (s^2) */
- if (rec->counter <= 1)
- stddev = 0;
- else {
- /*
- * Apply Welford's method:
- * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
- */
+ /*
+ * Variance formula:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ * Maybe Welford's method is better here?
+ * Divide only by 1000 for ns^2 -> us^2 conversion.
+ * trace_print_graph_duration will divide by 1000 again.
+ */
+ stddev = 0;
+ stddev_denom = rec->counter * (rec->counter - 1) * 1000;
+ if (stddev_denom) {
stddev = rec->counter * rec->time_squared -
rec->time * rec->time;
-
- /*
- * Divide only 1000 for ns^2 -> us^2 conversion.
- * trace_print_graph_duration will divide 1000 again.
- */
- stddev = div64_ul(stddev,
- rec->counter * (rec->counter - 1) * 1000);
+ stddev = div64_ul(stddev, stddev_denom);
}
trace_seq_init(&s);
@@ -587,10 +584,8 @@ static int function_stat_show(struct seq_file *m, void *v)
trace_print_seq(m, &s);
#endif
seq_putc(m, '\n');
-out:
- mutex_unlock(&ftrace_profile_lock);
- return ret;
+ return 0;
}
static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -786,27 +781,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
- unsigned long flags;
if (!ftrace_profile_enabled)
return;
- local_irq_save(flags);
+ guard(preempt_notrace)();
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
rec = ftrace_find_profiled_func(stat, ip);
if (!rec) {
rec = ftrace_profile_alloc(stat, ip);
if (!rec)
- goto out;
+ return;
}
rec->counter++;
- out:
- local_irq_restore(flags);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -817,9 +809,17 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
}
-static int profile_graph_entry(struct ftrace_graph_ent *trace)
+struct profile_fgraph_data {
+ unsigned long long calltime;
+ unsigned long long subtime;
+ unsigned long long sleeptime;
+};
+
+static int profile_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct ftrace_ret_stack *ret_stack;
+ struct profile_fgraph_data *profile_data;
function_profile_call(trace->func, 0, NULL, NULL);
@@ -827,42 +827,57 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
if (!current->ret_stack)
return 0;
- ret_stack = ftrace_graph_get_ret_stack(current, 0);
- if (ret_stack)
- ret_stack->subtime = 0;
+ profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data));
+ if (!profile_data)
+ return 0;
+
+ profile_data->subtime = 0;
+ profile_data->sleeptime = current->ftrace_sleeptime;
+ profile_data->calltime = trace_clock_local();
return 1;
}
-static void profile_graph_return(struct ftrace_graph_ret *trace)
+static void profile_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct ftrace_ret_stack *ret_stack;
+ struct profile_fgraph_data *profile_data;
struct ftrace_profile_stat *stat;
unsigned long long calltime;
+ unsigned long long rettime = trace_clock_local();
struct ftrace_profile *rec;
- unsigned long flags;
+ int size;
+
+ guard(preempt_notrace)();
- local_irq_save(flags);
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
+
+ profile_data = fgraph_retrieve_data(gops->idx, &size);
/* If the calltime was zero'd ignore it */
- if (!trace->calltime)
- goto out;
+ if (!profile_data || !profile_data->calltime)
+ return;
- calltime = trace->rettime - trace->calltime;
+ calltime = rettime - profile_data->calltime;
+
+ if (!fgraph_sleep_time) {
+ if (current->ftrace_sleeptime)
+ calltime -= current->ftrace_sleeptime - profile_data->sleeptime;
+ }
if (!fgraph_graph_time) {
+ struct profile_fgraph_data *parent_data;
/* Append this call time to the parent time to subtract */
- ret_stack = ftrace_graph_get_ret_stack(current, 1);
- if (ret_stack)
- ret_stack->subtime += calltime;
+ parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1);
+ if (parent_data)
+ parent_data->subtime += calltime;
- ret_stack = ftrace_graph_get_ret_stack(current, 0);
- if (ret_stack && ret_stack->subtime < calltime)
- calltime -= ret_stack->subtime;
+ if (profile_data->subtime && profile_data->subtime < calltime)
+ calltime -= profile_data->subtime;
else
calltime = 0;
}
@@ -872,9 +887,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
rec->time += calltime;
rec->time_squared += calltime * calltime;
}
-
- out:
- local_irq_restore(flags);
}
static struct fgraph_ops fprofiler_ops = {
@@ -884,6 +896,7 @@ static struct fgraph_ops fprofiler_ops = {
static int register_ftrace_profiler(void)
{
+ ftrace_ops_set_global_filter(&fprofiler_ops.ops);
return register_ftrace_graph(&fprofiler_ops);
}
@@ -894,12 +907,11 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
{
+ ftrace_ops_set_global_filter(&ftrace_profile_ops);
return register_ftrace_function(&ftrace_profile_ops);
}
@@ -922,20 +934,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
val = !!val;
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
if (ftrace_profile_enabled ^ val) {
if (val) {
ret = ftrace_profile_init();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ret = register_ftrace_profiler();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ftrace_profile_enabled = 1;
} else {
ftrace_profile_enabled = 0;
@@ -946,8 +954,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
unregister_ftrace_profiler();
}
}
- out:
- mutex_unlock(&ftrace_profile_lock);
*ppos += cnt;
@@ -1287,8 +1293,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
+ if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return;
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
}
EXPORT_SYMBOL_GPL(ftrace_free_filter);
@@ -1314,7 +1324,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
-
+/* Used to save filters on functions for modules not loaded yet */
static int ftrace_add_mod(struct trace_array *tr,
const char *func, const char *module,
int enable)
@@ -1380,15 +1390,17 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
return NULL;
}
-static void
-ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
-static void
-ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops);
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
+/*
+ * Allocate a new hash and remove entries from @src and move them to the new hash.
+ * On success, the @src hash will be empty and should be freed.
+ */
+static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size)
{
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
@@ -1424,6 +1436,7 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size)
return new_hash;
}
+/* Move the @src entries to a newly allocated hash */
static struct ftrace_hash *
__ftrace_hash_move(struct ftrace_hash *src)
{
@@ -1435,9 +1448,29 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (ftrace_hash_empty(src))
return EMPTY_HASH;
- return dup_hash(src, size);
+ return __move_hash(src, size);
}
+/**
+ * ftrace_hash_move - move a new hash to a filter and do updates
+ * @ops: The ops with the hash that @dst points to
+ * @enable: True if for the filter hash, false for the notrace hash
+ * @dst: Points to the @ops hash that should be updated
+ * @src: The hash to update @dst with
+ *
+ * This is called when an ftrace_ops hash is being updated and the
+ * the kernel needs to reflect this. Note, this only updates the kernel
+ * function callbacks if the @ops is enabled (not to be confused with
+ * @enable above). If the @ops is enabled, its hash determines what
+ * callbacks get called. This function gets called when the @ops hash
+ * is updated and it requires new callbacks.
+ *
+ * On success the elements of @src is moved to @dst, and @dst is updated
+ * properly, as well as the functions determined by the @ops hashes
+ * are now calling the @ops callback function.
+ *
+ * Regardless of return type, @src should be freed with free_ftrace_hash().
+ */
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1467,11 +1500,11 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
* Remove the current set, update the hash and add
* them back.
*/
- ftrace_hash_rec_disable_modify(ops, enable);
+ ftrace_hash_rec_disable_modify(ops);
rcu_assign_pointer(*dst, new_hash);
- ftrace_hash_rec_enable_modify(ops, enable);
+ ftrace_hash_rec_enable_modify(ops);
return 0;
}
@@ -1595,12 +1628,15 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
unsigned long ftrace_location_range(unsigned long start, unsigned long end)
{
struct dyn_ftrace *rec;
+ unsigned long ip = 0;
+ rcu_read_lock();
rec = lookup_rec(start, end);
if (rec)
- return rec->ip;
+ ip = rec->ip;
+ rcu_read_unlock();
- return 0;
+ return ip;
}
/**
@@ -1614,25 +1650,20 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
*/
unsigned long ftrace_location(unsigned long ip)
{
- struct dyn_ftrace *rec;
+ unsigned long loc;
unsigned long offset;
unsigned long size;
- rec = lookup_rec(ip, ip);
- if (!rec) {
+ loc = ftrace_location_range(ip, ip);
+ if (!loc) {
if (!kallsyms_lookup_size_offset(ip, &size, &offset))
- goto out;
+ return 0;
/* map sym+0 to __fentry__ */
if (!offset)
- rec = lookup_rec(ip, ip + size - 1);
+ loc = ftrace_location_range(ip, ip + size - 1);
}
-
- if (rec)
- return rec->ip;
-
-out:
- return 0;
+ return loc;
}
/**
@@ -1694,12 +1725,21 @@ static bool skip_record(struct dyn_ftrace *rec)
!(rec->flags & FTRACE_FL_ENABLED);
}
+/*
+ * This is the main engine to the ftrace updates to the dyn_ftrace records.
+ *
+ * It will iterate through all the available ftrace functions
+ * (the ones that ftrace can have callbacks to) and set the flags
+ * in the associated dyn_ftrace records.
+ *
+ * @inc: If true, the functions associated to @ops are added to
+ * the dyn_ftrace records, otherwise they are removed.
+ */
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
- int filter_hash,
bool inc)
{
struct ftrace_hash *hash;
- struct ftrace_hash *other_hash;
+ struct ftrace_hash *notrace_hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
bool update = false;
@@ -1711,35 +1751,16 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return false;
/*
- * In the filter_hash case:
* If the count is zero, we update all records.
* Otherwise we just update the items in the hash.
- *
- * In the notrace_hash case:
- * We enable the update in the hash.
- * As disabling notrace means enabling the tracing,
- * and enabling notrace means disabling, the inc variable
- * gets inversed.
*/
- if (filter_hash) {
- hash = ops->func_hash->filter_hash;
- other_hash = ops->func_hash->notrace_hash;
- if (ftrace_hash_empty(hash))
- all = true;
- } else {
- inc = !inc;
- hash = ops->func_hash->notrace_hash;
- other_hash = ops->func_hash->filter_hash;
- /*
- * If the notrace hash has no items,
- * then there's nothing to do.
- */
- if (ftrace_hash_empty(hash))
- return false;
- }
+ hash = ops->func_hash->filter_hash;
+ notrace_hash = ops->func_hash->notrace_hash;
+ if (ftrace_hash_empty(hash))
+ all = true;
do_for_each_ftrace_rec(pg, rec) {
- int in_other_hash = 0;
+ int in_notrace_hash = 0;
int in_hash = 0;
int match = 0;
@@ -1751,26 +1772,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* Only the filter_hash affects all records.
* Update if the record is not in the notrace hash.
*/
- if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
+ if (!notrace_hash || !ftrace_lookup_ip(notrace_hash, rec->ip))
match = 1;
} else {
in_hash = !!ftrace_lookup_ip(hash, rec->ip);
- in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
+ in_notrace_hash = !!ftrace_lookup_ip(notrace_hash, rec->ip);
/*
- * If filter_hash is set, we want to match all functions
- * that are in the hash but not in the other hash.
- *
- * If filter_hash is not set, then we are decrementing.
- * That means we match anything that is in the hash
- * and also in the other_hash. That is, we need to turn
- * off functions in the other hash because they are disabled
- * by this hash.
+ * We want to match all functions that are in the hash but
+ * not in the other hash.
*/
- if (filter_hash && in_hash && !in_other_hash)
- match = 1;
- else if (!filter_hash && in_hash &&
- (in_other_hash || ftrace_hash_empty(other_hash)))
+ if (in_hash && !in_notrace_hash)
match = 1;
}
if (!match)
@@ -1876,24 +1888,48 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
return update;
}
-static bool ftrace_hash_rec_disable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is removed from tracing. It will decrement
+ * the counters of the dyn_ftrace records for all the functions that
+ * the @ops attached to.
+ */
+static bool ftrace_hash_rec_disable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 0);
+ return __ftrace_hash_rec_update(ops, false);
}
-static bool ftrace_hash_rec_enable(struct ftrace_ops *ops,
- int filter_hash)
+/*
+ * This is called when an ops is added to tracing. It will increment
+ * the counters of the dyn_ftrace records for all the functions that
+ * the @ops attached to.
+ */
+static bool ftrace_hash_rec_enable(struct ftrace_ops *ops)
{
- return __ftrace_hash_rec_update(ops, filter_hash, 1);
+ return __ftrace_hash_rec_update(ops, true);
}
-static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
- int filter_hash, int inc)
+/*
+ * This function will update what functions @ops traces when its filter
+ * changes.
+ *
+ * The @inc states if the @ops callbacks are going to be added or removed.
+ * When one of the @ops hashes are updated to a "new_hash" the dyn_ftrace
+ * records are update via:
+ *
+ * ftrace_hash_rec_disable_modify(ops);
+ * ops->hash = new_hash
+ * ftrace_hash_rec_enable_modify(ops);
+ *
+ * Where the @ops is removed from all the records it is tracing using
+ * its old hash. The @ops hash is updated to the new hash, and then
+ * the @ops is added back to the records so that it is tracing all
+ * the new functions.
+ */
+static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, bool inc)
{
struct ftrace_ops *op;
- __ftrace_hash_rec_update(ops, filter_hash, inc);
+ __ftrace_hash_rec_update(ops, inc);
if (ops->func_hash != &global_ops.local_hash)
return;
@@ -1907,20 +1943,18 @@ static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
if (op == ops)
continue;
if (op->func_hash == &global_ops.local_hash)
- __ftrace_hash_rec_update(op, filter_hash, inc);
+ __ftrace_hash_rec_update(op, inc);
} while_for_each_ftrace_op(op);
}
-static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 0);
+ ftrace_hash_rec_update_modify(ops, false);
}
-static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
- int filter_hash)
+static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
{
- ftrace_hash_rec_update_modify(ops, filter_hash, 1);
+ ftrace_hash_rec_update_modify(ops, true);
}
/*
@@ -2023,7 +2057,7 @@ rollback:
continue;
if (rec == end)
- goto err_out;
+ return -EBUSY;
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -2036,7 +2070,6 @@ rollback:
rec->flags |= FTRACE_FL_IPMODIFY;
} while_for_each_ftrace_rec();
-err_out:
return -EBUSY;
}
@@ -2538,7 +2571,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
-int ftrace_direct_func_count;
/*
* Search the direct_functions hash to see if the given instruction pointer
@@ -3044,7 +3076,7 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
return ret;
}
- if (ftrace_hash_rec_enable(ops, 1))
+ if (ftrace_hash_rec_enable(ops))
command |= FTRACE_UPDATE_CALLS;
ftrace_startup_enable(command);
@@ -3086,7 +3118,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
/* Disabling ipmodify never fails */
ftrace_hash_ipmodify_disable(ops);
- if (ftrace_hash_rec_disable(ops, 1))
+ if (ftrace_hash_rec_disable(ops))
command |= FTRACE_UPDATE_CALLS;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -3157,8 +3189,7 @@ out:
* synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- synchronize_rcu_tasks();
+ synchronize_rcu_tasks();
ftrace_trampoline_free(ops);
}
@@ -3166,7 +3197,535 @@ out:
return 0;
}
-static u64 ftrace_update_time;
+/* Simply make a copy of @src and return it */
+static struct ftrace_hash *copy_hash(struct ftrace_hash *src)
+{
+ if (ftrace_hash_empty(src))
+ return EMPTY_HASH;
+
+ return alloc_and_copy_ftrace_hash(src->size_bits, src);
+}
+
+/*
+ * Append @new_hash entries to @hash:
+ *
+ * If @hash is the EMPTY_HASH then it traces all functions and nothing
+ * needs to be done.
+ *
+ * If @new_hash is the EMPTY_HASH, then make *hash the EMPTY_HASH so
+ * that it traces everything.
+ *
+ * Otherwise, go through all of @new_hash and add anything that @hash
+ * doesn't already have, to @hash.
+ *
+ * The filter_hash updates uses just the append_hash() function
+ * and the notrace_hash does not.
+ */
+static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash,
+ int size_bits)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ if (*hash) {
+ /* An empty hash does everything */
+ if (ftrace_hash_empty(*hash))
+ return 0;
+ } else {
+ *hash = alloc_ftrace_hash(size_bits);
+ if (!*hash)
+ return -ENOMEM;
+ }
+
+ /* If new_hash has everything make hash have everything */
+ if (ftrace_hash_empty(new_hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash->buckets[i], hlist) {
+ /* Only add if not already in hash */
+ if (!__ftrace_lookup_ip(*hash, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Remove functions from @hash that are in @notrace_hash
+ */
+static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_node *tmp;
+ int size;
+ int i;
+
+ /* If the notrace hash is empty, there's nothing to do */
+ if (ftrace_hash_empty(notrace_hash))
+ return;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(notrace_hash, entry->ip))
+ continue;
+ remove_hash_entry(hash, entry);
+ kfree(entry);
+ }
+ }
+}
+
+/*
+ * Add to @hash only those that are in both @new_hash1 and @new_hash2
+ *
+ * The notrace_hash updates uses just the intersect_hash() function
+ * and the filter_hash does not.
+ */
+static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash1,
+ struct ftrace_hash *new_hash2)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ /*
+ * If new_hash1 or new_hash2 is the EMPTY_HASH then make the hash
+ * empty as well as empty for notrace means none are notraced.
+ */
+ if (ftrace_hash_empty(new_hash1) || ftrace_hash_empty(new_hash2)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ return 0;
+ }
+
+ size = 1 << new_hash1->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &new_hash1->buckets[i], hlist) {
+ /* Only add if in both @new_hash1 and @new_hash2 */
+ if (__ftrace_lookup_ip(new_hash2, entry->ip) &&
+ add_hash_entry(*hash, entry->ip) == NULL)
+ return -ENOMEM;
+ }
+ }
+ /* If nothing intersects, make it the empty set */
+ if (ftrace_hash_empty(*hash)) {
+ free_ftrace_hash(*hash);
+ *hash = EMPTY_HASH;
+ }
+ return 0;
+}
+
+static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B)
+{
+ struct ftrace_func_entry *entry;
+ int size;
+ int i;
+
+ if (ftrace_hash_empty(A))
+ return ftrace_hash_empty(B);
+
+ if (ftrace_hash_empty(B))
+ return ftrace_hash_empty(A);
+
+ if (A->count != B->count)
+ return false;
+
+ size = 1 << A->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &A->buckets[i], hlist) {
+ if (!__ftrace_lookup_ip(B, entry->ip))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_ops_hash *old_hash);
+
+static int __ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
+ struct ftrace_hash **orig_hash,
+ struct ftrace_hash *hash,
+ int enable)
+{
+ struct ftrace_ops_hash old_hash_ops;
+ struct ftrace_hash *old_hash;
+ int ret;
+
+ old_hash = *orig_hash;
+ old_hash_ops.filter_hash = ops->func_hash->filter_hash;
+ old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
+ ret = ftrace_hash_move(ops, enable, orig_hash, hash);
+ if (!ret) {
+ ftrace_ops_update_code(ops, &old_hash_ops);
+ free_ftrace_hash_rcu(old_hash);
+ }
+ return ret;
+}
+
+static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_hash,
+ struct ftrace_hash *notrace_hash)
+{
+ int ret;
+
+ if (!ops_equal(filter_hash, ops->func_hash->filter_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->filter_hash,
+ filter_hash, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (!ops_equal(notrace_hash, ops->func_hash->notrace_hash)) {
+ ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->notrace_hash,
+ notrace_hash, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops_hash *func_hash)
+{
+ /* If the filter hash is not empty, simply remove the nohash from it */
+ if (!ftrace_hash_empty(func_hash->filter_hash)) {
+ *filter_hash = copy_hash(func_hash->filter_hash);
+ if (!*filter_hash)
+ return -ENOMEM;
+ remove_hash(*filter_hash, func_hash->notrace_hash);
+ *notrace_hash = EMPTY_HASH;
+
+ } else {
+ *notrace_hash = copy_hash(func_hash->notrace_hash);
+ if (!*notrace_hash)
+ return -ENOMEM;
+ *filter_hash = EMPTY_HASH;
+ }
+ return 0;
+}
+
+static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash)
+{
+ int size_bits;
+ int ret;
+
+ /* If the subops trace all functions so must the main ops */
+ if (ftrace_hash_empty(ops_hash->filter_hash) ||
+ ftrace_hash_empty(subops_hash->filter_hash)) {
+ *filter_hash = EMPTY_HASH;
+ } else {
+ /*
+ * The main ops filter hash is not empty, so its
+ * notrace_hash had better be, as the notrace hash
+ * is only used for empty main filter hashes.
+ */
+ WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash));
+
+ size_bits = max(ops_hash->filter_hash->size_bits,
+ subops_hash->filter_hash->size_bits);
+
+ /* Copy the subops hash */
+ *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash);
+ if (!*filter_hash)
+ return -ENOMEM;
+ /* Remove any notrace functions from the copy */
+ remove_hash(*filter_hash, subops_hash->notrace_hash);
+
+ ret = append_hash(filter_hash, ops_hash->filter_hash,
+ size_bits);
+ if (ret < 0) {
+ free_ftrace_hash(*filter_hash);
+ *filter_hash = EMPTY_HASH;
+ return ret;
+ }
+ }
+
+ /*
+ * Only process notrace hashes if the main filter hash is empty
+ * (tracing all functions), otherwise the filter hash will just
+ * remove the notrace hash functions, and the notrace hash is
+ * not needed.
+ */
+ if (ftrace_hash_empty(*filter_hash)) {
+ /*
+ * Intersect the notrace functions. That is, if two
+ * subops are not tracing a set of functions, the
+ * main ops will only not trace the functions that are
+ * in both subops, but has to trace the functions that
+ * are only notrace in one of the subops, for the other
+ * subops to be able to trace them.
+ */
+ size_bits = max(ops_hash->notrace_hash->size_bits,
+ subops_hash->notrace_hash->size_bits);
+ *notrace_hash = alloc_ftrace_hash(size_bits);
+ if (!*notrace_hash)
+ return -ENOMEM;
+
+ ret = intersect_hash(notrace_hash, ops_hash->notrace_hash,
+ subops_hash->notrace_hash);
+ if (ret < 0) {
+ free_ftrace_hash(*notrace_hash);
+ *notrace_hash = EMPTY_HASH;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * ftrace_startup_subops - enable tracing for subops of an ops
+ * @ops: Manager ops (used to pick all the functions of its subops)
+ * @subops: A new ops to add to @ops
+ * @command: Extra commands to use to enable tracing
+ *
+ * The @ops is a manager @ops that has the filter that includes all the functions
+ * that its list of subops are tracing. Adding a new @subops will add the
+ * functions of @subops to @ops.
+ */
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash = EMPTY_HASH;
+ struct ftrace_hash *notrace_hash = EMPTY_HASH;
+ struct ftrace_hash *save_filter_hash;
+ struct ftrace_hash *save_notrace_hash;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ ftrace_ops_init(ops);
+ ftrace_ops_init(subops);
+
+ if (WARN_ON_ONCE(subops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EBUSY;
+
+ /* Make everything canonical (Just in case!) */
+ if (!ops->func_hash->filter_hash)
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ if (!ops->func_hash->notrace_hash)
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ if (!subops->func_hash->filter_hash)
+ subops->func_hash->filter_hash = EMPTY_HASH;
+ if (!subops->func_hash->notrace_hash)
+ subops->func_hash->notrace_hash = EMPTY_HASH;
+
+ /* For the first subops to ops just enable it normally */
+ if (list_empty(&ops->subop_list)) {
+
+ /* The ops was empty, should have empty hashes */
+ WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash));
+ WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash));
+
+ ret = add_first_hash(&filter_hash, &notrace_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
+
+ save_filter_hash = ops->func_hash->filter_hash;
+ save_notrace_hash = ops->func_hash->notrace_hash;
+
+ ops->func_hash->filter_hash = filter_hash;
+ ops->func_hash->notrace_hash = notrace_hash;
+ list_add(&subops->list, &ops->subop_list);
+ ret = ftrace_startup(ops, command);
+ if (ret < 0) {
+ list_del(&subops->list);
+ ops->func_hash->filter_hash = save_filter_hash;
+ ops->func_hash->notrace_hash = save_notrace_hash;
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ } else {
+ free_ftrace_hash(save_filter_hash);
+ free_ftrace_hash(save_notrace_hash);
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+ }
+
+ /*
+ * Here there's already something attached. Here are the rules:
+ * If the new subops and main ops filter hashes are not empty:
+ * o Make a copy of the subops filter hash
+ * o Remove all functions in the nohash from it.
+ * o Add in the main hash filter functions
+ * o Remove any of these functions from the main notrace hash
+ */
+
+ ret = add_next_hash(&filter_hash, &notrace_hash, ops->func_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
+
+ list_add(&subops->list, &ops->subop_list);
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ if (ret < 0) {
+ list_del(&subops->list);
+ } else {
+ subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP;
+ subops->managed = ops;
+ }
+ return ret;
+}
+
+static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash,
+ struct ftrace_ops *ops)
+{
+ struct ftrace_ops_hash temp_hash;
+ struct ftrace_ops *subops;
+ bool first = true;
+ int ret;
+
+ temp_hash.filter_hash = EMPTY_HASH;
+ temp_hash.notrace_hash = EMPTY_HASH;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ *filter_hash = EMPTY_HASH;
+ *notrace_hash = EMPTY_HASH;
+
+ if (first) {
+ ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash);
+ if (ret < 0)
+ return ret;
+ first = false;
+ } else {
+ ret = add_next_hash(filter_hash, notrace_hash,
+ &temp_hash, subops->func_hash);
+ if (ret < 0) {
+ free_ftrace_hash(temp_hash.filter_hash);
+ free_ftrace_hash(temp_hash.notrace_hash);
+ return ret;
+ }
+ }
+
+ free_ftrace_hash(temp_hash.filter_hash);
+ free_ftrace_hash(temp_hash.notrace_hash);
+
+ temp_hash.filter_hash = *filter_hash;
+ temp_hash.notrace_hash = *notrace_hash;
+ }
+ return 0;
+}
+
+/**
+ * ftrace_shutdown_subops - Remove a subops from a manager ops
+ * @ops: A manager ops to remove @subops from
+ * @subops: The subops to remove from @ops
+ * @command: Any extra command flags to add to modifying the text
+ *
+ * Removes the functions being traced by the @subops from @ops. Note, it
+ * will not affect functions that are being traced by other subops that
+ * still exist in @ops.
+ *
+ * If the last subops is removed from @ops, then @ops is shutdown normally.
+ */
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ struct ftrace_hash *filter_hash = EMPTY_HASH;
+ struct ftrace_hash *notrace_hash = EMPTY_HASH;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ if (WARN_ON_ONCE(!(subops->flags & FTRACE_OPS_FL_ENABLED)))
+ return -EINVAL;
+
+ list_del(&subops->list);
+
+ if (list_empty(&ops->subop_list)) {
+ /* Last one, just disable the current ops */
+
+ ret = ftrace_shutdown(ops, command);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ return ret;
+ }
+
+ subops->flags &= ~FTRACE_OPS_FL_ENABLED;
+
+ free_ftrace_hash(ops->func_hash->filter_hash);
+ free_ftrace_hash(ops->func_hash->notrace_hash);
+ ops->func_hash->filter_hash = EMPTY_HASH;
+ ops->func_hash->notrace_hash = EMPTY_HASH;
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+
+ return 0;
+ }
+
+ /* Rebuild the hashes without subops */
+ ret = rebuild_hashes(&filter_hash, &notrace_hash, ops);
+ if (ret < 0)
+ return ret;
+
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ if (ret < 0) {
+ list_add(&subops->list, &ops->subop_list);
+ } else {
+ subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP);
+ subops->managed = NULL;
+ }
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ return ret;
+}
+
+static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops,
+ struct ftrace_hash **orig_subhash,
+ struct ftrace_hash *hash)
+{
+ struct ftrace_ops *ops = subops->managed;
+ struct ftrace_hash *notrace_hash;
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *save_hash;
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Manager ops can not be subops (yet) */
+ if (WARN_ON_ONCE(!ops || ops->flags & FTRACE_OPS_FL_SUBOP))
+ return -EINVAL;
+
+ /* Move the new hash over to the subops hash */
+ save_hash = *orig_subhash;
+ *orig_subhash = __ftrace_hash_move(hash);
+ if (!*orig_subhash) {
+ *orig_subhash = save_hash;
+ return -ENOMEM;
+ }
+
+ ret = rebuild_hashes(&filter_hash, &notrace_hash, ops);
+ if (!ret) {
+ ret = ftrace_update_ops(ops, filter_hash, notrace_hash);
+ free_ftrace_hash(filter_hash);
+ free_ftrace_hash(notrace_hash);
+ }
+
+ if (ret) {
+ /* Put back the original hash */
+ new_hash = *orig_subhash;
+ *orig_subhash = save_hash;
+ free_ftrace_hash_rcu(new_hash);
+ } else {
+ free_ftrace_hash_rcu(save_hash);
+ }
+ return ret;
+}
+
+
+u64 ftrace_update_time;
+u64 ftrace_total_mod_time;
unsigned long ftrace_update_tot_cnt;
unsigned long ftrace_number_of_pages;
unsigned long ftrace_number_of_groups;
@@ -3186,7 +3745,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
- u64 start, stop;
+ u64 start, stop, update_time;
unsigned long update_cnt = 0;
unsigned long rec_flags = 0;
int i;
@@ -3230,7 +3789,11 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
}
stop = ftrace_now(raw_smp_processor_id());
- ftrace_update_time = stop - start;
+ update_time = stop - start;
+ if (mod)
+ ftrace_total_mod_time += update_time;
+ else
+ ftrace_update_time = update_time;
ftrace_update_tot_cnt += update_cnt;
return 0;
@@ -3810,6 +4373,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip)
}
#endif
+static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *subops;
+ bool first = true;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ if (!((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ hash_contains_ip(rec->ip, subops->func_hash)))
+ continue;
+ if (first) {
+ seq_printf(m, "\tsubops:");
+ first = false;
+ }
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (subops->flags & FTRACE_OPS_FL_GRAPH) {
+ struct fgraph_ops *gops;
+
+ gops = container_of(subops, struct fgraph_ops, ops);
+ seq_printf(m, " {ent:%pS ret:%pS}",
+ (void *)gops->entryfunc,
+ (void *)gops->retfunc);
+ continue;
+ }
+#endif
+ if (subops->trampoline) {
+ seq_printf(m, " {%pS (%pS)}",
+ (void *)subops->trampoline,
+ (void *)subops->func);
+ add_trampoline_func(m, subops, rec);
+ } else {
+ seq_printf(m, " {%pS}",
+ (void *)subops->func);
+ }
+ }
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -3862,6 +4461,7 @@ static int t_show(struct seq_file *m, void *v)
(void *)ops->trampoline,
(void *)ops->func);
add_trampoline_func(m, ops, rec);
+ print_subops(m, ops, rec);
ops = ftrace_find_tramp_ops_next(rec, ops);
} while (ops);
} else
@@ -3874,6 +4474,7 @@ static int t_show(struct seq_file *m, void *v)
if (ops) {
seq_printf(m, "\tops: %pS (%pS)",
ops, ops->func);
+ print_subops(m, ops, rec);
} else {
seq_puts(m, "\tops: ERROR!");
}
@@ -4202,12 +4803,12 @@ static int
add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
int clear_filter)
{
- long index = simple_strtoul(func_g->search, NULL, 0);
+ long index;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
/* The index starts at 1 */
- if (--index < 0)
+ if (kstrtoul(func_g->search, 0, &index) || --index < 0)
return 0;
do_for_each_ftrace_rec(pg, rec) {
@@ -4309,15 +4910,13 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
mod_g.len = strlen(mod_g.search);
}
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out_unlock;
+ return 0;
- if (func_g.type == MATCH_INDEX) {
- found = add_rec_by_index(hash, &func_g, clear_filter);
- goto out_unlock;
- }
+ if (func_g.type == MATCH_INDEX)
+ return add_rec_by_index(hash, &func_g, clear_filter);
do_for_each_ftrace_rec(pg, rec) {
@@ -4326,16 +4925,12 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
ret = enter_record(hash, rec, clear_filter);
- if (ret < 0) {
- found = ret;
- goto out_unlock;
- }
+ if (ret < 0)
+ return ret;
found = 1;
}
cond_resched();
} while_for_each_ftrace_rec();
- out_unlock:
- mutex_unlock(&ftrace_lock);
return found;
}
@@ -4382,36 +4977,33 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
struct ftrace_hash *hash,
int enable)
{
- struct ftrace_ops_hash old_hash_ops;
- struct ftrace_hash *old_hash;
- int ret;
-
- old_hash = *orig_hash;
- old_hash_ops.filter_hash = ops->func_hash->filter_hash;
- old_hash_ops.notrace_hash = ops->func_hash->notrace_hash;
- ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret) {
- ftrace_ops_update_code(ops, &old_hash_ops);
- free_ftrace_hash_rcu(old_hash);
- }
- return ret;
-}
+ if (ops->flags & FTRACE_OPS_FL_SUBOP)
+ return ftrace_hash_move_and_update_subops(ops, orig_hash, hash);
-static bool module_exists(const char *module)
-{
- /* All modules have the symbol __this_module */
- static const char this_mod[] = "__this_module";
- char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
- unsigned long val;
- int n;
+ /*
+ * If this ops is not enabled, it could be sharing its filters
+ * with a subop. If that's the case, update the subop instead of
+ * this ops. Shared filters are only allowed to have one ops set
+ * at a time, and if we update the ops that is not enabled,
+ * it will not affect subops that share it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) {
+ struct ftrace_ops *op;
- n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
+ /* Check if any other manager subops maps to this hash */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ struct ftrace_ops *subops;
- if (n > sizeof(modname) - 1)
- return false;
+ list_for_each_entry(subops, &op->subop_list, list) {
+ if ((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ subops->func_hash == ops->func_hash) {
+ return ftrace_hash_move_and_update_subops(subops, orig_hash, hash);
+ }
+ }
+ } while_for_each_ftrace_op(op);
+ }
- val = module_kallsyms_lookup_name(modname);
- return val != 0;
+ return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
}
static int cache_mod(struct trace_array *tr,
@@ -4419,14 +5011,14 @@ static int cache_mod(struct trace_array *tr,
{
struct ftrace_mod_load *ftrace_mod, *n;
struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
- int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
/* We do not cache inverse filters */
if (func[0] == '!') {
+ int ret = -EINVAL;
+
func++;
- ret = -EINVAL;
/* Look to remove this hash */
list_for_each_entry_safe(ftrace_mod, n, head, list) {
@@ -4442,26 +5034,17 @@ static int cache_mod(struct trace_array *tr,
continue;
}
}
- goto out;
+ return ret;
}
- ret = -EINVAL;
/* We only care about modules that have not been loaded yet */
if (module_exists(module))
- goto out;
+ return -EINVAL;
/* Save this string off, and execute it when the module is loaded */
- ret = ftrace_add_mod(tr, func, module, enable);
- out:
- mutex_unlock(&ftrace_lock);
-
- return ret;
+ return ftrace_add_mod(tr, func, module, enable);
}
-static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
- int reset, int enable);
-
#ifdef CONFIG_MODULES
static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
char *mod, bool enable)
@@ -4565,6 +5148,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
char *func;
int ret;
+ if (!tr)
+ return -ENODEV;
+
/* match_records() modifies func, and we need the original */
func = kstrdup(func_orig, GFP_KERNEL);
if (!func)
@@ -4622,8 +5208,12 @@ struct ftrace_func_map {
void *data;
};
+/*
+ * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash).
+ * The hash field must be the first field.
+ */
struct ftrace_func_mapper {
- struct ftrace_hash hash;
+ struct ftrace_hash hash; /* Must be first! */
};
/**
@@ -4758,6 +5348,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
}
}
}
+ /* This also frees the mapper itself */
free_ftrace_hash(&mapper->hash);
}
@@ -4765,7 +5356,7 @@ static void release_probe(struct ftrace_func_probe *probe)
{
struct ftrace_probe_ops *probe_ops;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
WARN_ON(probe->ref <= 0);
@@ -4783,7 +5374,6 @@ static void release_probe(struct ftrace_func_probe *probe)
list_del(&probe->list);
kfree(probe);
}
- mutex_unlock(&ftrace_lock);
}
static void acquire_probe_locked(struct ftrace_func_probe *probe)
@@ -5089,20 +5679,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex);
__init int register_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p;
- int ret = 0;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &ftrace_commands);
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -5112,20 +5697,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd)
__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry_safe(p, n, &ftrace_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -ENODEV;
}
static int ftrace_process_regex(struct ftrace_iterator *iter,
@@ -5135,7 +5717,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
- int ret = -EINVAL;
+ int ret;
func = strsep(&next, ":");
@@ -5152,17 +5734,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
command = strsep(&next, ":");
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->func(tr, hash, func, command, next, enable);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->func(tr, hash, func, command, next, enable);
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t
@@ -5196,12 +5775,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
parser->idx, enable);
trace_parser_clear(parser);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = read;
- out:
- return ret;
+ return read;
}
ssize_t
@@ -5233,6 +5810,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return -ENOENT;
free_hash_entry(hash, entry);
return 0;
+ } else if (__ftrace_lookup_ip(hash, ip) != NULL) {
+ /* Already exists */
+ return 0;
}
entry = add_hash_entry(hash, ip);
@@ -5262,7 +5842,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long *ips, unsigned int cnt,
- int remove, int reset, int enable)
+ int remove, int reset, int enable, char *mod)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5288,7 +5868,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
goto out_regex_unlock;
}
- if (buf && !ftrace_match_records(hash, buf, len)) {
+ if (buf && !match_records(hash, buf, len, mod)) {
+ /* If this was for a module and nothing was enabled, flag it */
+ if (mod)
+ (*orig_hash)->flags |= FTRACE_HASH_FL_MOD;
+
+ /*
+ * Even if it is a mod, return error to let caller know
+ * nothing was added
+ */
ret = -EINVAL;
goto out_regex_unlock;
}
@@ -5313,19 +5901,11 @@ static int
ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-struct ftrace_direct_func {
- struct list_head next;
- unsigned long addr;
- int count;
-};
-
-static LIST_HEAD(ftrace_direct_funcs);
-
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
/*
@@ -5366,6 +5946,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
}
}
+static void register_ftrace_direct_cb(struct rcu_head *rhp)
+{
+ struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu);
+
+ free_ftrace_hash(fhp);
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
@@ -5420,9 +6007,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
/* Make a copy hash to place the new and the old entries in */
size = hash->count + direct_functions->count;
- if (size > 32)
- size = 32;
- new_hash = alloc_ftrace_hash(fls(size));
+ size = fls(size);
+ if (size > FTRACE_HASH_MAX_BITS)
+ size = FTRACE_HASH_MAX_BITS;
+ new_hash = alloc_ftrace_hash(size);
if (!new_hash)
goto out_unlock;
@@ -5464,10 +6052,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash && free_hash != EMPTY_HASH) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
+ if (free_hash && free_hash != EMPTY_HASH)
+ call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
if (new_hash)
free_ftrace_hash(new_hash);
@@ -5480,6 +6066,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
* unregister_ftrace_direct - Remove calls to custom trampoline
* previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the direct function that is called by the @ops functions
+ * @free_filters: Set to true to remove all filters for the ftrace_ops, false otherwise
*
* This is used to remove a direct calls to @addr from the nop locations
* of the functions registered in @ops (with by ftrace_set_filter_ip
@@ -5692,7 +6280,38 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
+ char *mod = NULL, *func, *command, *next = buf;
+ char *tmp __free(kfree) = NULL;
+ struct trace_array *tr = ops->private;
+ int ret;
+
+ func = strsep(&next, ":");
+
+ /* This can also handle :mod: parsing */
+ if (next) {
+ if (!tr)
+ return -EINVAL;
+
+ command = strsep(&next, ":");
+ if (strcmp(command, "mod") != 0)
+ return -EINVAL;
+
+ mod = next;
+ len = command - func;
+ /* Save the original func as ftrace_set_hash() can modify it */
+ tmp = kstrdup(func, GFP_KERNEL);
+ }
+
+ ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod);
+
+ if (tr && mod && ret < 0) {
+ /* Did tmp fail to allocate? */
+ if (!tmp)
+ return -ENOMEM;
+ ret = cache_mod(tr, tmp, mod, enable);
+ }
+
+ return ret;
}
/**
@@ -5817,9 +6436,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function);
static int __init set_graph_max_depth_function(char *str)
{
- if (!str)
+ if (!str || kstrtouint(str, 0, &fgraph_max_depth))
return 0;
- fgraph_max_depth = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("ftrace_graph_max_depth=", set_graph_max_depth_function);
@@ -5857,6 +6475,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
ftrace_ops_init(ops);
+ /* The trace_array is needed for caching module function filters */
+ if (!ops->private) {
+ struct trace_array *tr = trace_get_global_array();
+
+ ops->private = tr;
+ ftrace_init_trace_array(tr);
+ }
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -6296,12 +6922,10 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
func_g.len = strlen(func_g.search);
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
- if (unlikely(ftrace_disabled)) {
- mutex_unlock(&ftrace_lock);
+ if (unlikely(ftrace_disabled))
return -ENODEV;
- }
do_for_each_ftrace_rec(pg, rec) {
@@ -6317,7 +6941,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
if (add_hash_entry(hash, rec->ip) == NULL)
- goto out;
+ return 0;
} else {
if (entry) {
free_hash_entry(hash, entry);
@@ -6325,14 +6949,10 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
}
}
}
+ cond_resched();
} while_for_each_ftrace_rec();
-out:
- mutex_unlock(&ftrace_lock);
-
- if (fail)
- return -EINVAL;
- return 0;
+ return fail ? -EINVAL : 0;
}
static ssize_t
@@ -6493,6 +7113,7 @@ static int ftrace_process_locs(struct module *mod,
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
+ unsigned long pages;
int ret = -ENOMEM;
count = end - start;
@@ -6500,6 +7121,8 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;
+ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
+
/*
* Sorting mcount in vmlinux at build time depend on
* CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in
@@ -6544,7 +7167,9 @@ static int ftrace_process_locs(struct module *mod,
pg = start_pg;
while (p < end) {
unsigned long end_offset;
- addr = ftrace_call_adjust(*p++);
+
+ addr = *p++;
+
/*
* Some architecture linkers will pad between
* the different mcount_loc sections of different
@@ -6556,6 +7181,19 @@ static int ftrace_process_locs(struct module *mod,
continue;
}
+ /*
+ * If this is core kernel, make sure the address is in core
+ * or inittext, as weak functions get zeroed and KASLR can
+ * move them to something other than zero. It just will not
+ * move it to an area where kernel text is.
+ */
+ if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) {
+ skipped++;
+ continue;
+ }
+
+ addr = ftrace_call_adjust(addr);
+
end_offset = (pg->index+1) * sizeof(pg->records[0]);
if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
@@ -6595,9 +7233,41 @@ static int ftrace_process_locs(struct module *mod,
/* We should have used all pages unless we skipped some */
if (pg_unuse) {
- WARN_ON(!skipped);
+ unsigned long pg_remaining, remaining = 0;
+ unsigned long skip;
+
+ /* Count the number of entries unused and compare it to skipped. */
+ pg_remaining = (ENTRIES_PER_PAGE << pg->order) - pg->index;
+
+ if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) {
+
+ skip = skipped - pg_remaining;
+
+ for (pg = pg_unuse; pg; pg = pg->next)
+ remaining += 1 << pg->order;
+
+ pages -= remaining;
+
+ skip = DIV_ROUND_UP(skip, ENTRIES_PER_PAGE);
+
+ /*
+ * Check to see if the number of pages remaining would
+ * just fit the number of entries skipped.
+ */
+ WARN(skip != remaining, "Extra allocated pages for ftrace: %lu with %lu skipped",
+ remaining, skipped);
+ }
+ /* Need to synchronize with ftrace_location_range() */
+ synchronize_rcu();
ftrace_free_pages(pg_unuse);
}
+
+ if (!mod) {
+ count -= skipped;
+ pr_info("ftrace: allocating %ld entries in %ld pages\n",
+ count, pages);
+ }
+
return ret;
}
@@ -6809,6 +7479,9 @@ void ftrace_release_mod(struct module *mod)
out_unlock:
mutex_unlock(&ftrace_lock);
+ /* Need to synchronize with ftrace_location_range() */
+ if (tmp_page)
+ synchronize_rcu();
for (pg = tmp_page; pg; pg = tmp_page) {
/* Needs to be called outside of ftrace_lock */
@@ -6970,7 +7643,7 @@ allocate_ftrace_mod_map(struct module *mod,
return mod_map;
}
-static const char *
+static int
ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
@@ -6991,21 +7664,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
*size = found_func->size;
if (off)
*off = addr - found_func->ip;
- if (sym)
- strscpy(sym, found_func->name, KSYM_NAME_LEN);
-
- return found_func->name;
+ return strscpy(sym, found_func->name, KSYM_NAME_LEN);
}
- return NULL;
+ return 0;
}
-const char *
+int
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char **modname, char *sym)
{
struct ftrace_mod_map *mod_map;
- const char *ret = NULL;
+ int ret = 0;
/* mod_map is freed via call_rcu() */
preempt_disable();
@@ -7142,6 +7812,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
unsigned long start = (unsigned long)(start_ptr);
unsigned long end = (unsigned long)(end_ptr);
struct ftrace_page **last_pg = &ftrace_pages_start;
+ struct ftrace_page *tmp_page = NULL;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
@@ -7183,12 +7854,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
- if (pg->records) {
- free_pages((unsigned long)pg->records, pg->order);
- ftrace_number_of_pages -= 1 << pg->order;
- }
- ftrace_number_of_groups--;
- kfree(pg);
+ pg->next = tmp_page;
+ tmp_page = pg;
pg = container_of(last_pg, struct ftrace_page, next);
if (!(*last_pg))
ftrace_pages = pg;
@@ -7205,6 +7872,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
clear_func_from_hashes(func);
kfree(func);
}
+ /* Need to synchronize with ftrace_location_range() */
+ if (tmp_page) {
+ synchronize_rcu();
+ ftrace_free_pages(tmp_page);
+ }
}
void __init ftrace_free_init_mem(void)
@@ -7241,9 +7913,6 @@ void __init ftrace_init(void)
goto failed;
}
- pr_info("ftrace: allocating %ld entries in %ld pages\n",
- count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
-
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
@@ -7293,9 +7962,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
+ if (tr->flags & TRACE_ARRAY_FL_MOD_INIT)
+ return;
+
INIT_LIST_HEAD(&tr->func_probes);
INIT_LIST_HEAD(&tr->mod_trace);
INIT_LIST_HEAD(&tr->mod_notrace);
+
+ tr->flags |= TRACE_ARRAY_FL_MOD_INIT;
}
#else
@@ -7324,8 +7998,10 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
- tr->ops->private = tr;
+ if (!global_ops.private)
+ global_ops.private = tr;
ftrace_init_trace_array(tr);
+ init_array_fgraph_ops(tr, tr->ops);
}
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
@@ -7406,6 +8082,7 @@ out:
void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
+ kmsan_unpoison_memory(fregs, ftrace_regs_size());
__ftrace_ops_list_func(ip, parent_ip, NULL, fregs);
}
#else
@@ -7764,7 +8441,7 @@ pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
switch (type) {
case TRACE_PIDS:
@@ -7780,14 +8457,13 @@ pid_write(struct file *filp, const char __user *ubuf,
lockdep_is_held(&ftrace_lock));
break;
default:
- ret = -EINVAL;
WARN_ON_ONCE(1);
- goto out;
+ return -EINVAL;
}
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
switch (type) {
case TRACE_PIDS:
@@ -7816,11 +8492,8 @@ pid_write(struct file *filp, const char __user *ubuf,
ftrace_update_pid_func();
ftrace_startup_all(0);
- out:
- mutex_unlock(&ftrace_lock);
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -7895,6 +8568,7 @@ void ftrace_kill(void)
ftrace_disabled = 1;
ftrace_enabled = 0;
ftrace_trace_function = ftrace_stub;
+ kprobe_ftrace_kill();
}
/**
@@ -8219,20 +8893,20 @@ static bool is_permanent_ops_registered(void)
}
static int
-ftrace_enable_sysctl(struct ctl_table *table, int write,
+ftrace_enable_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = -ENODEV;
+ int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out;
+ return -ENODEV;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
- goto out;
+ return ret;
if (ftrace_enabled) {
@@ -8246,8 +8920,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
} else {
if (is_permanent_ops_registered()) {
ftrace_enabled = true;
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* stopping ftrace calls (just send to ftrace_stub) */
@@ -8257,12 +8930,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
}
last_ftrace_enabled = !!ftrace_enabled;
- out:
- mutex_unlock(&ftrace_lock);
- return ret;
+ return 0;
}
-static struct ctl_table ftrace_sysctls[] = {
+static const struct ctl_table ftrace_sysctls[] = {
{
.procname = "ftrace_enabled",
.data = &ftrace_enabled,
@@ -8270,7 +8941,6 @@ static struct ctl_table ftrace_sysctls[] = {
.mode = 0644,
.proc_handler = ftrace_enable_sysctl,
},
- {}
};
static int __init ftrace_sysctl_init(void)
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 5012c04f92c0..3235470e61b3 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -15,6 +15,8 @@ extern struct ftrace_ops global_ops;
int ftrace_startup(struct ftrace_ops *ops, int command);
int ftrace_shutdown(struct ftrace_ops *ops, int command);
int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs);
+int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
+int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command);
#else /* !CONFIG_DYNAMIC_FTRACE */
@@ -38,14 +40,26 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
return 1;
}
+static inline int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
+static inline int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern int ftrace_graph_active;
-void update_function_graph_func(void);
+# ifdef CONFIG_DYNAMIC_FTRACE
+extern void fgraph_update_pid_func(void);
+# else
+static inline void fgraph_update_pid_func(void) {}
+# endif
#else /* !CONFIG_FUNCTION_GRAPH_TRACER */
# define ftrace_graph_active 0
-static inline void update_function_graph_func(void) { }
+static inline void fgraph_update_pid_func(void) {}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
#else /* !CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 95106d02b32d..090bb5ea4a19 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -81,13 +81,9 @@ static inline bool upper_empty(union upper_chunk *chunk)
{
/*
* If chunk->data has no lower chunks, it will be the same
- * as a zeroed bitmask. Use find_first_bit() to test it
- * and if it doesn't find any bits set, then the array
- * is empty.
+ * as a zeroed bitmask.
*/
- int bit = find_first_bit((unsigned long *)chunk->data,
- sizeof(chunk->data) * 8);
- return bit >= sizeof(chunk->data) * 8;
+ return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data));
}
static inline int pid_split(unsigned int pid, unsigned int *upper1,
@@ -354,7 +350,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (upper_count-- > 0) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*upper_next = chunk;
@@ -365,7 +361,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (lower_count-- > 0) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*lower_next = chunk;
@@ -414,7 +410,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
@@ -451,6 +447,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
/**
* trace_pid_list_free - Frees an allocated pid_list.
+ * @pid_list: The pid list to free.
*
* Frees the memory for a pid_list that was allocated.
*/
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 8c4ffd076162..314ffc143039 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
static struct completion done;
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
static void busy_wait(ulong time)
{
u64 start, end;
@@ -215,4 +213,5 @@ static void __exit preemptirq_delay_exit(void)
module_init(preemptirq_delay_init)
module_exit(preemptirq_delay_exit)
+MODULE_DESCRIPTION("Preempt / IRQ disable delay thread to test latency tracers");
MODULE_LICENSE("GPL v2");
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index fa03094e9e69..30d224946881 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!handler))
return NULL;
+#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
/*
* This expects the caller will set up a rethook on a function entry.
* When the function returns, the rethook will eventually be reclaimed
@@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
*/
if (unlikely(!rcu_is_watching()))
return NULL;
+#endif
return (struct rethook_node *)objpool_pop(&rh->pool);
}
@@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame
if (WARN_ON_ONCE(!cur))
return 0;
- if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ if (tsk != current && task_is_running(tsk))
return 0;
do {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6511dc3a00da..e24509bd0af5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,6 +9,7 @@
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/sched/clock.h>
+#include <linux/cacheflush.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
@@ -26,9 +27,13 @@
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/mm.h>
#include <asm/local64.h>
#include <asm/local.h>
+#include <asm/setup.h>
+
+#include "trace.h"
/*
* The "absolute" timestamp in the buffer is only 59 bits.
@@ -40,6 +45,24 @@
static void update_pages_handler(struct work_struct *work);
+#define RING_BUFFER_META_MAGIC 0xBADFEED
+
+struct ring_buffer_meta {
+ int magic;
+ int struct_sizes;
+ unsigned long total_size;
+ unsigned long buffers_offset;
+};
+
+struct ring_buffer_cpu_meta {
+ unsigned long first_buffer;
+ unsigned long head_buffer;
+ unsigned long commit_buffer;
+ __u32 subbuf_size;
+ __u32 nr_subbufs;
+ int buffers[];
+};
+
/*
* The ring buffer header is special. We must manually up keep it.
*/
@@ -312,6 +335,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_MASK (3 << 30)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -338,6 +363,8 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
+ u32 id:30; /* ID for external mapping */
+ u32 range:1; /* Mapped via a range */
struct buffer_data_page *page; /* Actual data page */
};
@@ -368,7 +395,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_pages((unsigned long)bpage->page, bpage->order);
+ /* Range pages are not to be freed */
+ if (!bpage->range)
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -457,6 +486,8 @@ struct ring_buffer_per_cpu {
unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
+ /* pages generation counter, incremented when the list changes */
+ unsigned long cnt;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
@@ -484,6 +515,14 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
/* pages removed since last reset */
unsigned long pages_removed;
+
+ unsigned int mapped;
+ unsigned int user_mapped; /* user space mapping */
+ struct mutex mapping_lock;
+ unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct trace_buffer_meta *meta_page;
+ struct ring_buffer_cpu_meta *ring_meta;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -512,6 +551,11 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+ unsigned long range_addr_start;
+ unsigned long range_addr_end;
+
+ struct ring_buffer_meta *meta;
+
unsigned int subbuf_size;
unsigned int subbuf_order;
unsigned int max_data_size;
@@ -682,18 +726,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
}
/**
- * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
- * @buffer: The ring_buffer to get the number of pages from
- * @cpu: The cpu of the ring_buffer to get the number of pages from
- *
- * Returns the number of pages used by a per_cpu buffer of the ring buffer.
- */
-size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
-{
- return buffer->buffers[cpu]->nr_pages;
-}
-
-/**
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
@@ -1240,6 +1272,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
* Set the previous list pointer to have the HEAD flag.
*/
rb_set_list_to_head(head->list.prev);
+
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ meta->head_buffer = (unsigned long)head->page;
+ }
}
static void rb_list_head_clear(struct list_head *list)
@@ -1443,6 +1480,20 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
+static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *list)
+{
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(list->next)->prev) != list))
+ return false;
+
+ if (RB_WARN_ON(cpu_buffer,
+ rb_list_head(rb_list_head(list->prev)->next) != list))
+ return false;
+
+ return true;
+}
+
/**
* rb_check_pages - integrity check of buffer pages
* @cpu_buffer: CPU buffer with pages to test
@@ -1452,31 +1503,607 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
*/
static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct list_head *head = rb_list_head(cpu_buffer->pages);
- struct list_head *tmp;
+ struct list_head *head, *tmp;
+ unsigned long buffer_cnt;
+ unsigned long flags;
+ int nr_loops = 0;
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(head->next)->prev) != head))
+ /*
+ * Walk the linked list underpinning the ring buffer and validate all
+ * its next and prev links.
+ *
+ * The check acquires the reader_lock to avoid concurrent processing
+ * with code that could be modifying the list. However, the lock cannot
+ * be held for the entire duration of the walk, as this would make the
+ * time when interrupts are disabled non-deterministic, dependent on the
+ * ring buffer size. Therefore, the code releases and re-acquires the
+ * lock after checking each page. The ring_buffer_per_cpu.cnt variable
+ * is then used to detect if the list was modified while the lock was
+ * not held, in which case the check needs to be restarted.
+ *
+ * The code attempts to perform the check at most three times before
+ * giving up. This is acceptable because this is only a self-validation
+ * to detect problems early on. In practice, the list modification
+ * operations are fairly spaced, and so this check typically succeeds at
+ * most on the second try.
+ */
+again:
+ if (++nr_loops > 3)
return;
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(head->prev)->next) != head))
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ head = rb_list_head(cpu_buffer->pages);
+ if (!rb_check_links(cpu_buffer, head))
+ goto out_locked;
+ buffer_cnt = cpu_buffer->cnt;
+ tmp = head;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ while (true) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (buffer_cnt != cpu_buffer->cnt) {
+ /* The list was updated, try again. */
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ goto again;
+ }
+
+ tmp = rb_list_head(tmp->next);
+ if (tmp == head)
+ /* The iteration circled back, all is done. */
+ goto out_locked;
+
+ if (!rb_check_links(cpu_buffer, tmp))
+ goto out_locked;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+
+out_locked:
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+
+/*
+ * Take an address, add the meta data size as well as the array of
+ * array subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+ addr += sizeof(struct ring_buffer_cpu_meta) +
+ sizeof(int) * nr_subbufs;
+ return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return the ring_buffer_meta for a given @cpu.
+ */
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
+{
+ int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ struct ring_buffer_cpu_meta *meta;
+ struct ring_buffer_meta *bmeta;
+ unsigned long ptr;
+ int nr_subbufs;
+
+ bmeta = buffer->meta;
+ if (!bmeta)
+ return NULL;
+
+ ptr = (unsigned long)bmeta + bmeta->buffers_offset;
+ meta = (struct ring_buffer_cpu_meta *)ptr;
+
+ /* When nr_pages passed in is zero, the first meta has already been initialized */
+ if (!nr_pages) {
+ nr_subbufs = meta->nr_subbufs;
+ } else {
+ /* Include the reader page */
+ nr_subbufs = nr_pages + 1;
+ }
+
+ /*
+ * The first chunk may not be subbuffer aligned, where as
+ * the rest of the chunks are.
+ */
+ if (cpu) {
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* We can use multiplication to find chunks greater than 1 */
+ if (cpu > 1) {
+ unsigned long size;
+ unsigned long p;
+
+ /* Save the beginning of this CPU chunk */
+ p = ptr;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+ ptr += subbuf_size * nr_subbufs;
+
+ /* Now all chunks after this are the same size */
+ size = ptr - p;
+ ptr += size * (cpu - 2);
+ }
+ }
+ return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta)
+{
+ int subbuf_size = meta->subbuf_size;
+ unsigned long ptr;
+
+ ptr = (unsigned long)meta;
+ ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+ return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+ struct ring_buffer_cpu_meta *meta;
+ unsigned long ptr;
+ int subbuf_size;
+
+ meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu);
+ if (!meta)
+ return NULL;
+
+ if (WARN_ON_ONCE(idx >= meta->nr_subbufs))
+ return NULL;
+
+ subbuf_size = meta->subbuf_size;
+
+ /* Map this buffer to the order that's in meta->buffers[] */
+ idx = meta->buffers[idx];
+
+ ptr = (unsigned long)rb_subbufs_from_meta(meta);
+
+ ptr += subbuf_size * idx;
+ if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end)
+ return NULL;
+
+ return (void *)ptr;
+}
+
+/*
+ * See if the existing memory contains a valid meta section.
+ * if so, use that, otherwise initialize it.
+ */
+static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size)
+{
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *bmeta;
+ unsigned long total_size;
+ int struct_sizes;
+
+ bmeta = (struct ring_buffer_meta *)ptr;
+ buffer->meta = bmeta;
+
+ total_size = buffer->range_addr_end - buffer->range_addr_start;
+
+ struct_sizes = sizeof(struct ring_buffer_cpu_meta);
+ struct_sizes |= sizeof(*bmeta) << 16;
+
+ /* The first buffer will start word size after the meta page */
+ ptr += sizeof(*bmeta);
+ ptr = ALIGN(ptr, sizeof(long));
+ ptr += scratch_size;
+
+ if (bmeta->magic != RING_BUFFER_META_MAGIC) {
+ pr_info("Ring buffer boot meta mismatch of magic\n");
+ goto init;
+ }
+
+ if (bmeta->struct_sizes != struct_sizes) {
+ pr_info("Ring buffer boot meta mismatch of struct size\n");
+ goto init;
+ }
+
+ if (bmeta->total_size != total_size) {
+ pr_info("Ring buffer boot meta mismatch of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset > bmeta->total_size) {
+ pr_info("Ring buffer boot meta mismatch of offset outside of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) {
+ pr_info("Ring buffer boot meta mismatch of first buffer offset\n");
+ goto init;
+ }
+
+ return true;
+
+ init:
+ bmeta->magic = RING_BUFFER_META_MAGIC;
+ bmeta->struct_sizes = struct_sizes;
+ bmeta->total_size = total_size;
+ bmeta->buffers_offset = (void *)ptr - (void *)bmeta;
+
+ /* Zero out the scatch pad */
+ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta));
+
+ return false;
+}
+
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
+{
+ int subbuf_size = PAGE_SIZE;
+ struct buffer_data_page *subbuf;
+ unsigned long buffers_start;
+ unsigned long buffers_end;
+ int i;
+
+ if (!subbuf_mask)
+ return false;
+
+ buffers_start = meta->first_buffer;
+ buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+ /* Is the head and commit buffers within the range of buffers? */
+ if (meta->head_buffer < buffers_start ||
+ meta->head_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu);
+ return false;
+ }
+
+ if (meta->commit_buffer < buffers_start ||
+ meta->commit_buffer >= buffers_end) {
+ pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu);
+ return false;
+ }
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
+
+ /* Is the meta buffers and the subbufs themselves have correct data? */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ if (meta->buffers[i] < 0 ||
+ meta->buffers[i] >= meta->nr_subbufs) {
+ pr_info("Ring buffer boot meta [%d] array out of range\n", cpu);
+ return false;
+ }
+
+ if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+ pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+ return false;
+ }
+
+ if (test_bit(meta->buffers[i], subbuf_mask)) {
+ pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
+ return false;
+ }
+
+ set_bit(meta->buffers[i], subbuf_mask);
+ subbuf = (void *)subbuf + subbuf_size;
+ }
+
+ return true;
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+ unsigned long long *timestamp, u64 *delta_ptr)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+ int events = 0;
+ int e;
+
+ *delta_ptr = 0;
+ *timestamp = 0;
+
+ ts = dpage->time_stamp;
+
+ for (e = 0; e < tail; e += rb_event_length(event)) {
+
+ event = (struct ring_buffer_event *)(dpage->data + e);
+
+ switch (event->type_len) {
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = rb_event_time_stamp(event);
+ ts += delta;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ delta = rb_event_time_stamp(event);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ *delta_ptr = delta;
+ *timestamp = ts;
+ return -1;
+ }
+ ts = delta;
+ break;
+
+ case RINGBUF_TYPE_PADDING:
+ if (event->time_delta == 1)
+ break;
+ fallthrough;
+ case RINGBUF_TYPE_DATA:
+ events++;
+ ts += event->time_delta;
+ break;
+
+ default:
+ return -1;
+ }
+ }
+ *timestamp = ts;
+ return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+ unsigned long long ts;
+ u64 delta;
+ int tail;
+
+ tail = local_read(&dpage->commit);
+ return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page;
+ unsigned long entry_bytes = 0;
+ unsigned long entries = 0;
+ int ret;
+ int i;
+
+ if (!meta || !meta->head_buffer)
return;
- for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
- return;
+ /* Do the reader page first */
+ ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer reader page is invalid\n");
+ goto invalid;
+ }
+ entries += ret;
+ entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+ local_set(&cpu_buffer->reader_page->entries, ret);
- if (RB_WARN_ON(cpu_buffer,
- rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
- return;
+ head_page = cpu_buffer->head_page;
+
+ /* If the commit_buffer is the reader page, update the commit page */
+ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
+ cpu_buffer->commit_page = cpu_buffer->reader_page;
+ /* Nothing more to do, the only page is the reader page */
+ goto done;
+ }
+
+ /* Iterate until finding the commit page */
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+ /* Reader page has already been done */
+ if (head_page == cpu_buffer->reader_page)
+ continue;
+
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0) {
+ pr_info("Ring buffer meta [%d] invalid buffer page\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
+ entries += ret;
+ entry_bytes += local_read(&head_page->page->commit);
+ local_set(&cpu_buffer->head_page->entries, ret);
+
+ if (head_page == cpu_buffer->commit_page)
+ break;
+ }
+
+ if (head_page != cpu_buffer->commit_page) {
+ pr_info("Ring buffer meta [%d] commit page not found\n",
+ cpu_buffer->cpu);
+ goto invalid;
+ }
+ done:
+ local_set(&cpu_buffer->entries, entries);
+ local_set(&cpu_buffer->entries_bytes, entry_bytes);
+
+ pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu);
+ return;
+
+ invalid:
+ /* The content of the buffers are invalid, reset the meta data */
+ meta->head_buffer = 0;
+ meta->commit_buffer = 0;
+
+ /* Reset the reader page */
+ local_set(&cpu_buffer->reader_page->entries, 0);
+ local_set(&cpu_buffer->reader_page->page->commit, 0);
+
+ /* Reset all the subbuffers */
+ for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) {
+ local_set(&head_page->entries, 0);
+ local_set(&head_page->page->commit, 0);
+ }
+}
+
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size)
+{
+ struct ring_buffer_cpu_meta *meta;
+ unsigned long *subbuf_mask;
+ unsigned long delta;
+ void *subbuf;
+ bool valid = false;
+ int cpu;
+ int i;
+
+ /* Create a mask to test the subbuf array */
+ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
+ /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+
+ if (rb_meta_init(buffer, scratch_size))
+ valid = true;
+
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ void *next_meta;
+
+ meta = rb_range_meta(buffer, nr_pages, cpu);
+
+ if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
+ /* Make the mappings match the current address */
+ subbuf = rb_subbufs_from_meta(meta);
+ delta = (unsigned long)subbuf - meta->first_buffer;
+ meta->first_buffer += delta;
+ meta->head_buffer += delta;
+ meta->commit_buffer += delta;
+ continue;
+ }
+
+ if (cpu < nr_cpu_ids - 1)
+ next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+ else
+ next_meta = (void *)buffer->range_addr_end;
+
+ memset(meta, 0, next_meta - (void *)meta);
+
+ meta->nr_subbufs = nr_pages + 1;
+ meta->subbuf_size = PAGE_SIZE;
+
+ subbuf = rb_subbufs_from_meta(meta);
+
+ meta->first_buffer = (unsigned long)subbuf;
+
+ /*
+ * The buffers[] array holds the order of the sub-buffers
+ * that are after the meta data. The sub-buffers may
+ * be swapped out when read and inserted into a different
+ * location of the ring buffer. Although their addresses
+ * remain the same, the buffers[] array contains the
+ * index into the sub-buffers holding their actual order.
+ */
+ for (i = 0; i < meta->nr_subbufs; i++) {
+ meta->buffers[i] = i;
+ rb_init_page(subbuf);
+ subbuf += meta->subbuf_size;
+ }
+ }
+ bitmap_free(subbuf_mask);
+}
+
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val;
+
+ if (!meta)
+ return NULL;
+
+ if (*pos > meta->nr_subbufs)
+ return NULL;
+
+ val = *pos;
+ val++;
+
+ return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+
+ return rbm_start(m, pos);
+}
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = m->private;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ unsigned long val = (unsigned long)v;
+
+ if (val == 1) {
+ seq_printf(m, "head_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+ seq_printf(m, "commit_buffer: %d\n",
+ rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+ seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size);
+ seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs);
+ return 0;
+ }
+
+ val -= 2;
+ seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]);
+
+ return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+ .start = rbm_start,
+ .next = rbm_next,
+ .show = rbm_show,
+ .stop = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &rb_meta_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = buffer->buffers[cpu];
+
+ return 0;
+}
+
+/* Map the buffer_pages to the previous head and commit pages */
+static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage)
+{
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+
+ if (meta->head_buffer == (unsigned long)bpage->page)
+ cpu_buffer->head_page = bpage;
+
+ if (meta->commit_buffer == (unsigned long)bpage->page) {
+ cpu_buffer->commit_page = bpage;
+ cpu_buffer->tail_page = bpage;
}
}
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
+ struct trace_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
@@ -1511,6 +2138,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
*/
if (user_thread)
set_current_oom_origin();
+
+ if (buffer->range_addr_start)
+ meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1521,16 +2152,32 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_check_bpage(cpu_buffer, bpage);
- list_add(&bpage->list, pages);
-
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto free_pages;
- bpage->page = page_address(page);
+ /*
+ * Append the pages as for mapped buffers we want to keep
+ * the order
+ */
+ list_add_tail(&bpage->list, pages);
+
+ if (meta) {
+ /* A range was given. Use that for the buffer page */
+ bpage->page = rb_range_buffer(cpu_buffer, i + 1);
+ if (!bpage->page)
+ goto free_pages;
+ /* If this is valid from a previous boot */
+ if (meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ bpage->id = i + 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ mflags | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto free_pages;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
bpage->order = cpu_buffer->buffer->subbuf_order;
- rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
goto free_pages;
@@ -1579,7 +2226,8 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL;
+ struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
struct page *page;
int ret;
@@ -1599,22 +2247,39 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+ mutex_init(&cpu_buffer->mapping_lock);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
- goto fail_free_buffer;
+ return NULL;
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
- goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
+ if (buffer->range_addr_start) {
+ /*
+ * Range mapped buffers have the same restrictions as memory
+ * mapped ones do.
+ */
+ cpu_buffer->mapped = 1;
+ cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu);
+ bpage->page = rb_range_buffer(cpu_buffer, 0);
+ if (!bpage->page)
+ goto fail_free_reader;
+ if (cpu_buffer->ring_meta->head_buffer)
+ rb_meta_buffer_update(cpu_buffer, bpage);
+ bpage->range = 1;
+ } else {
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page)
+ goto fail_free_reader;
+ bpage->page = page_address(page);
+ rb_init_page(bpage->page);
+ }
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
@@ -1623,19 +2288,41 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (ret < 0)
goto fail_free_reader;
- cpu_buffer->head_page
- = list_entry(cpu_buffer->pages, struct buffer_page, list);
- cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+ rb_meta_validate_events(cpu_buffer);
+
+ /* If the boot meta was valid then this has already been updated */
+ meta = cpu_buffer->ring_meta;
+ if (!meta || !meta->head_buffer ||
+ !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) {
+ if (meta && meta->head_buffer &&
+ (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) {
+ pr_warn("Ring buffer meta buffers not all mapped\n");
+ if (!cpu_buffer->head_page)
+ pr_warn(" Missing head_page\n");
+ if (!cpu_buffer->commit_page)
+ pr_warn(" Missing commit_page\n");
+ if (!cpu_buffer->tail_page)
+ pr_warn(" Missing tail_page\n");
+ }
- rb_head_page_activate(cpu_buffer);
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
- return cpu_buffer;
+ rb_head_page_activate(cpu_buffer);
+
+ if (cpu_buffer->ring_meta)
+ meta->commit_buffer = meta->head_buffer;
+ } else {
+ /* The valid meta buffer still needs to activate the head page */
+ rb_head_page_activate(cpu_buffer);
+ }
+
+ return_ptr(cpu_buffer);
fail_free_reader:
free_buffer_page(cpu_buffer->reader_page);
- fail_free_buffer:
- kfree(cpu_buffer);
return NULL;
}
@@ -1664,22 +2351,15 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
-/**
- * __ring_buffer_alloc - allocate a new ring_buffer
- * @size: the size in bytes per cpu that is needed.
- * @flags: attributes to set for the ring buffer.
- * @key: ring buffer reader_lock_key.
- *
- * Currently the only flag that is available is the RB_FL_OVERWRITE
- * flag. This flag means that the buffer will overwrite old data
- * when the buffer wraps. If this flag is not set, the buffer will
- * drop data when the tail hits the head.
- */
-struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
- struct lock_class_key *key)
+static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long end,
+ unsigned long scratch_size,
+ struct lock_class_key *key)
{
- struct trace_buffer *buffer;
+ struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
+ int subbuf_size;
int bsize;
int cpu;
int ret;
@@ -1691,16 +2371,15 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
return NULL;
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
- goto fail_free_buffer;
+ return NULL;
- /* Default buffer page size - one system page */
- buffer->subbuf_order = 0;
- buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+ buffer->subbuf_order = order;
+ subbuf_size = (PAGE_SIZE << order);
+ buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE;
/* Max payload is buffer page size - header (8bytes) */
buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
- nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -1708,10 +2387,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
- /* need at least two pages */
- if (nr_pages < 2)
- nr_pages = 2;
-
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
@@ -1720,6 +2395,69 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ /* If start/end are specified, then that overrides size */
+ if (start && end) {
+ unsigned long buffers_start;
+ unsigned long ptr;
+ int n;
+
+ /* Make sure that start is word aligned */
+ start = ALIGN(start, sizeof(long));
+
+ /* scratch_size needs to be aligned too */
+ scratch_size = ALIGN(scratch_size, sizeof(long));
+
+ /* Subtract the buffer meta data and word aligned */
+ buffers_start = start + sizeof(struct ring_buffer_cpu_meta);
+ buffers_start = ALIGN(buffers_start, sizeof(long));
+ buffers_start += scratch_size;
+
+ /* Calculate the size for the per CPU data */
+ size = end - buffers_start;
+ size = size / nr_cpu_ids;
+
+ /*
+ * The number of sub-buffers (nr_pages) is determined by the
+ * total size allocated minus the meta data size.
+ * Then that is divided by the number of per CPU buffers
+ * needed, plus account for the integer array index that
+ * will be appended to the meta data.
+ */
+ nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) /
+ (subbuf_size + sizeof(int));
+ /* Need at least two pages plus the reader page */
+ if (nr_pages < 3)
+ goto fail_free_buffers;
+
+ again:
+ /* Make sure that the size fits aligned */
+ for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_cpu_meta) +
+ sizeof(int) * nr_pages;
+ ptr = ALIGN(ptr, subbuf_size);
+ ptr += subbuf_size * nr_pages;
+ }
+ if (ptr > end) {
+ if (nr_pages <= 3)
+ goto fail_free_buffers;
+ nr_pages--;
+ goto again;
+ }
+
+ /* nr_pages should not count the reader page */
+ nr_pages--;
+ buffer->range_addr_start = start;
+ buffer->range_addr_end = end;
+
+ rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else {
+
+ /* need at least two pages */
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
+ if (nr_pages < 2)
+ nr_pages = 2;
+ }
+
cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
@@ -1732,7 +2470,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
- return buffer;
+ return_ptr(buffer);
fail_free_buffers:
for_each_buffer_cpu(buffer, cpu) {
@@ -1744,13 +2482,73 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- fail_free_buffer:
- kfree(buffer);
return NULL;
}
+
+/**
+ * __ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+ struct lock_class_key *key)
+{
+ /* Default buffer page size - one system page */
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key);
+
+}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
/**
+ * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory
+ * @size: the size in bytes per cpu that is needed.
+ * @flags: attributes to set for the ring buffer.
+ * @order: sub-buffer order
+ * @start: start of allocated range
+ * @range_size: size of allocated range
+ * @scratch_size: size of scratch area (for preallocated memory buffers)
+ * @key: ring buffer reader_lock_key.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+ int order, unsigned long start,
+ unsigned long range_size,
+ unsigned long scratch_size,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(size, flags, order, start, start + range_size,
+ scratch_size, key);
+}
+
+void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
+{
+ struct ring_buffer_meta *meta;
+ void *ptr;
+
+ if (!buffer || !buffer->meta)
+ return NULL;
+
+ meta = buffer->meta;
+
+ ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long));
+
+ if (size)
+ *size = (void *)meta + meta->buffers_offset - ptr;
+
+ return ptr;
+}
+
+/**
* ring_buffer_free - free a ring buffer.
* @buffer: the buffer to free.
*/
@@ -1789,8 +2587,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
return buffer->time_stamp_abs;
}
-static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
return local_read(&bpage->entries) & RB_WRITE_MASK;
@@ -1859,6 +2655,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
/* make sure pages points to a valid page in the ring buffer */
cpu_buffer->pages = next_page;
+ cpu_buffer->cnt++;
/* update head page */
if (head_bit)
@@ -1965,6 +2762,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* pointer to point to end of list
*/
head_page->prev = last_page;
+ cpu_buffer->cnt++;
success = true;
break;
}
@@ -2047,6 +2845,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
+ /*
+ * Keep CPUs from coming online while resizing to synchronize
+ * with new per CPU buffers being created.
+ */
+ guard(cpus_read_lock)();
+
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resizing);
@@ -2091,7 +2895,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cond_resched();
}
- cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2131,7 +2934,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2159,8 +2961,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- cpus_read_lock();
-
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
@@ -2179,7 +2979,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- cpus_read_unlock();
}
out:
@@ -2318,7 +3117,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
/* Size is determined by what has been committed */
static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
{
- return rb_page_commit(bpage);
+ return rb_page_commit(bpage) & ~RB_MISSED_MASK;
}
static __always_inline unsigned
@@ -2357,6 +3156,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->next_event = 0;
}
+/* Return the index into the sub-buffers for a given sub-buffer */
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf)
+{
+ void *subbuf_array;
+
+ subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs;
+ subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size);
+ return (subbuf - subbuf_array) / meta->subbuf_size;
+}
+
+static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *next_page)
+{
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ unsigned long old_head = (unsigned long)next_page->page;
+ unsigned long new_head;
+
+ rb_inc_page(&next_page);
+ new_head = (unsigned long)next_page->page;
+
+ /*
+ * Only move it forward once, if something else came in and
+ * moved it forward, then we don't want to touch it.
+ */
+ (void)cmpxchg(&meta->head_buffer, old_head, new_head);
+}
+
+static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *reader)
+{
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ void *old_reader = cpu_buffer->reader_page->page;
+ void *new_reader = reader->page;
+ int id;
+
+ id = reader->id;
+ cpu_buffer->reader_page->id = id;
+ reader->id = 0;
+
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader);
+ meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader);
+
+ /* The head pointer is the one after the reader */
+ rb_update_meta_head(cpu_buffer, reader);
+}
+
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2406,6 +3251,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_head(cpu_buffer, next_page);
/*
* The entries will be zeroed out when we move the
* tail page.
@@ -2967,6 +3814,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
+ }
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -3281,7 +4132,7 @@ static const char *show_irq_str(int bits)
return type[bits];
}
-/* Assume this is an trace event */
+/* Assume this is a trace event */
static const char *show_flags(struct ring_buffer_event *event)
{
struct trace_entry *entry;
@@ -3413,11 +4264,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
- struct ring_buffer_event *event;
struct buffer_data_page *bpage;
u64 ts, delta;
bool full = false;
- int e;
+ int ret;
bpage = info->tail_page->page;
@@ -3443,39 +4293,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
- ts = bpage->time_stamp;
-
- for (e = 0; e < tail; e += rb_event_length(event)) {
-
- event = (struct ring_buffer_event *)(bpage->data + e);
-
- switch (event->type_len) {
-
- case RINGBUF_TYPE_TIME_EXTEND:
- delta = rb_event_time_stamp(event);
- ts += delta;
- break;
-
- case RINGBUF_TYPE_TIME_STAMP:
- delta = rb_event_time_stamp(event);
- delta = rb_fix_abs_ts(delta, ts);
- if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
- }
- ts = delta;
- break;
-
- case RINGBUF_TYPE_PADDING:
- if (event->time_delta == 1)
- break;
- fallthrough;
- case RINGBUF_TYPE_DATA:
- ts += event->time_delta;
- break;
-
- default:
- RB_WARN_ON(cpu_buffer, 1);
+ ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
+ if (ret < 0) {
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ goto out;
}
}
if ((full && ts > info->ts) ||
@@ -3650,8 +4473,13 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
- /* ring buffer does cmpxchg, make sure it is safe in NMI context */
- if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ /*
+ * ring buffer does cmpxchg as well as atomic64 operations
+ * (which some archs use locking for atomic64), make sure this
+ * is safe in NMI context
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) &&
(unlikely(in_nmi()))) {
return NULL;
}
@@ -3853,10 +4681,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
rb_decrement_entry(cpu_buffer, event);
- if (rb_try_to_discard(cpu_buffer, event))
- goto out;
-
- out:
+ rb_try_to_discard(cpu_buffer, event);
rb_end_commit(cpu_buffer);
trace_recursive_unlock(cpu_buffer);
@@ -3934,40 +4759,22 @@ int ring_buffer_write(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = rb_set_head_page(cpu_buffer);
- struct buffer_page *commit = cpu_buffer->commit_page;
-
- /* In case of error, head will be NULL */
- if (unlikely(!head))
- return true;
-
- /* Reader should exhaust content in reader page */
- if (reader->read != rb_page_commit(reader))
- return false;
-
- /*
- * If writers are committing on the reader page, knowing all
- * committed content has been read, the ring buffer is empty.
- */
- if (commit == reader)
- return true;
-
- /*
- * If writers are committing on a page other than reader page
- * and head page, there should always be content to read.
- */
- if (commit != head)
- return false;
+ return local_read(&cpu_buffer->entries) -
+ (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
- /*
- * Writers are committing on the head page, we just need
- * to care about there're committed data, and the reader will
- * swap reader page with head page when it is to read data.
- */
- return rb_page_commit(commit) == 0;
+static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return !rb_num_of_entries(cpu_buffer);
}
/**
@@ -4072,6 +4879,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
}
/**
+ * ring_buffer_record_is_on_cpu - return true if the ring buffer can write
+ * @buffer: The ring buffer to see if write is enabled
+ * @cpu: The CPU to test if the ring buffer can write too
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes
+ * for a particular CPU.
+ */
+bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ return ring_buffer_record_is_set_on(buffer) &&
+ !atomic_read(&cpu_buffer->record_disabled);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
@@ -4113,19 +4938,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
-/*
- * The total entries in the ring buffer is the running counter
- * of entries entered into the ring buffer, minus the sum of
- * the entries read from the ring buffer and the number of
- * entries that were overwritten.
- */
-static inline unsigned long
-rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return local_read(&cpu_buffer->entries) -
- (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
-}
-
/**
* ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
* @buffer: The ring buffer
@@ -4416,7 +5228,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
return ((iter->head_page == commit_page && iter->head >= commit) ||
(iter->head_page == reader && commit_page == head_page &&
head_page->read == commit &&
- iter->head == rb_page_commit(cpu_buffer->reader_page)));
+ iter->head == rb_page_size(cpu_buffer->reader_page)));
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
@@ -4573,7 +5385,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* moving it. The page before the header page has the
* flag bit '1' set if it is pointing to the page we want.
* but if the writer is in the process of moving it
- * than it will be '2' or already moved '0'.
+ * then it will be '2' or already moved '0'.
*/
ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
@@ -4584,6 +5396,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
if (!ret)
goto spin;
+ if (cpu_buffer->ring_meta)
+ rb_update_meta_reader(cpu_buffer, reader);
+
/*
* Yay! We succeeded in replacing the page.
*
@@ -4592,6 +5407,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
rb_inc_page(&cpu_buffer->head_page);
+ cpu_buffer->cnt++;
local_inc(&cpu_buffer->pages_read);
/* Finally update the reader page to the new head */
@@ -5036,13 +5852,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* @flags: gfp flags to use for memory allocation
*
* This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer recording
+ * through the buffer. Memory is allocated, buffer resizing
* is disabled, and the iterator pointer is returned to the caller.
*
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
* After a sequence of ring_buffer_read_prepare calls, the user is
* expected to make at least one call to ring_buffer_read_prepare_sync.
* Afterwards, ring_buffer_read_start is invoked to get things going
@@ -5129,24 +5941,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_start);
* ring_buffer_read_finish - finish reading the iterator of the buffer
* @iter: The iterator retrieved by ring_buffer_start
*
- * This re-enables the recording to the buffer, and frees the
- * iterator.
+ * This re-enables resizing of the buffer, and frees the iterator.
*/
void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
- unsigned long flags;
- /*
- * Ring buffer is disabled from recording, here's a good place
- * to check the integrity of the ring buffer.
- * Must prevent readers from trying to read, as the check
- * clears the HEAD page and readers require it.
- */
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ /* Use this opportunity to check the integrity of the ring buffer. */
rb_check_pages(cpu_buffer);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->resize_disabled);
kfree(iter->event);
@@ -5211,6 +6014,60 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+/*
+ * When the buffer is memory mapped to user space, each sub buffer
+ * has a unique id that is used by the meta data to tell the user
+ * where the current reader page is.
+ *
+ * For a normal allocated ring buffer, the id is saved in the buffer page
+ * id field, and updated via this function.
+ *
+ * But for a fixed memory mapped buffer, the id is already assigned for
+ * fixed memory ording in the memory layout and can not be used. Instead
+ * the index of where the page lies in the memory layout is used.
+ *
+ * For the normal pages, set the buffer page id with the passed in @id
+ * value and return that.
+ *
+ * For fixed memory mapped pages, get the page index in the memory layout
+ * and return that as the id.
+ */
+static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage, int id)
+{
+ /*
+ * For boot buffers, the id is the index,
+ * otherwise, set the buffer page with this id
+ */
+ if (cpu_buffer->ring_meta)
+ id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page);
+ else
+ bpage->id = id;
+
+ return id;
+}
+
+static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+
+ if (!meta)
+ return;
+
+ meta->reader.read = cpu_buffer->reader_page->read;
+ meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page,
+ cpu_buffer->reader_page->id);
+
+ meta->reader.lost_events = cpu_buffer->lost_events;
+
+ meta->entries = local_read(&cpu_buffer->entries);
+ meta->overrun = local_read(&cpu_buffer->overrun);
+ meta->read = cpu_buffer->read;
+
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5257,26 +6114,29 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
+
+ if (cpu_buffer->mapped) {
+ rb_update_meta_page(cpu_buffer);
+ if (cpu_buffer->ring_meta) {
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ meta->commit_buffer = meta->head_buffer;
+ }
+ }
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ return;
arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
arch_spin_unlock(&cpu_buffer->lock);
-
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/**
@@ -5464,31 +6324,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
- goto out;
+ return -EINVAL;
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ /* It's up to the callers to not try to swap mapped buffers */
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped))
+ return -EBUSY;
+
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
- goto out;
+ return -EINVAL;
if (buffer_a->subbuf_order != buffer_b->subbuf_order)
- goto out;
-
- ret = -EAGAIN;
+ return -EINVAL;
if (atomic_read(&buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
/*
* We can't do a synchronize_rcu here because this
@@ -5525,7 +6387,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -5579,7 +6440,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
goto out;
page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO,
+ GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page) {
kfree(bpage);
@@ -5684,43 +6545,42 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
struct buffer_data_page *bpage;
struct buffer_page *reader;
unsigned long missed_events;
- unsigned long flags;
unsigned int commit;
unsigned int read;
u64 save_timestamp;
- int ret = -1;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -1;
/*
* If len is not big enough to hold the page header, then
* we can not copy anything.
*/
if (len <= BUF_PAGE_HDR_SIZE)
- goto out;
+ return -1;
len -= BUF_PAGE_HDR_SIZE;
if (!data_page || !data_page->data)
- goto out;
+ return -1;
+
if (data_page->order != buffer->subbuf_order)
- goto out;
+ return -1;
bpage = data_page->data;
if (!bpage)
- goto out;
+ return -1;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
- goto out_unlock;
+ return -1;
event = rb_reader_event(cpu_buffer);
read = reader->read;
- commit = rb_page_commit(reader);
+ commit = rb_page_size(reader);
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
@@ -5733,7 +6593,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
- cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ cpu_buffer->reader_page == cpu_buffer->commit_page ||
+ cpu_buffer->mapped) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -5748,7 +6609,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
- goto out_unlock;
+ return -1;
if (len > (commit - read))
len = (commit - read);
@@ -5757,7 +6618,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
size = rb_event_ts_length(event);
if (len < size)
- goto out_unlock;
+ return -1;
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
@@ -5796,7 +6657,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
- cpu_buffer->read_bytes += rb_page_commit(reader);
+ cpu_buffer->read_bytes += rb_page_size(reader);
/* swap the pages */
rb_init_page(bpage);
@@ -5815,7 +6676,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (reader->real_end)
local_set(&bpage->commit, reader->real_end);
}
- ret = read;
cpu_buffer->lost_events = 0;
@@ -5842,11 +6702,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (commit < buffer->subbuf_size)
memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
- out_unlock:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- out:
- return ret;
+ return read;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
@@ -5956,6 +6812,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer = buffer->buffers[cpu];
+ if (cpu_buffer->mapped) {
+ err = -EBUSY;
+ goto error;
+ }
+
/* Update the number of pages to match the new size */
nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
@@ -5980,39 +6841,39 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
for_each_buffer_cpu(buffer, cpu) {
+ struct buffer_data_page *old_free_data_page;
+ struct list_head old_pages;
+ unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
continue;
cpu_buffer = buffer->buffers[cpu];
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
/* Clear the head bit to make the link list normal to read */
rb_head_page_deactivate(cpu_buffer);
- /* Now walk the list and free all the old sub buffers */
- list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
- list_del_init(&bpage->list);
- free_buffer_page(bpage);
- }
- /* The above loop stopped an the last page needing to be freed */
- bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
- free_buffer_page(bpage);
-
- /* Free the current reader page */
- free_buffer_page(cpu_buffer->reader_page);
+ /*
+ * Collect buffers from the cpu_buffer pages list and the
+ * reader_page on old_pages, so they can be freed later when not
+ * under a spinlock. The pages list is a linked list with no
+ * head, adding old_pages turns it into a regular list with
+ * old_pages being the head.
+ */
+ list_add(&old_pages, cpu_buffer->pages);
+ list_add(&cpu_buffer->reader_page->list, &old_pages);
/* One page was allocated for the reader page */
cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
struct buffer_page, list);
list_del_init(&cpu_buffer->reader_page->list);
- /* The cpu_buffer pages are a link list with no head */
+ /* Install the new pages, remove the head from the list */
cpu_buffer->pages = cpu_buffer->new_pages.next;
- cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
- cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
-
- /* Clear the new_pages list */
- INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ list_del_init(&cpu_buffer->new_pages);
+ cpu_buffer->cnt++;
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
@@ -6021,11 +6882,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
cpu_buffer->nr_pages_to_update = 0;
- free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ old_free_data_page = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
rb_head_page_activate(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ /* Free old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, &old_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ free_pages((unsigned long)old_free_data_page, old_order);
+
rb_check_pages(cpu_buffer);
}
@@ -6057,6 +6927,440 @@ error:
}
EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct page *page;
+
+ if (cpu_buffer->meta_page)
+ return 0;
+
+ page = alloc_page(GFP_USER | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ cpu_buffer->meta_page = page_to_virt(page);
+
+ return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+
+ free_page(addr);
+ cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *subbuf_ids)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
+ struct buffer_page *first_subbuf, *subbuf;
+ int cnt = 0;
+ int id = 0;
+
+ id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
+ subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ cnt++;
+
+ first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
+ do {
+ id = rb_page_id(cpu_buffer, subbuf, id);
+
+ if (WARN_ON(id >= nr_subbufs))
+ break;
+
+ subbuf_ids[id] = (unsigned long)subbuf->page;
+
+ rb_inc_page(&subbuf);
+ id++;
+ cnt++;
+ } while (subbuf != first_subbuf);
+
+ WARN_ON(cnt != nr_subbufs);
+
+ /* install subbuf ID to kern VA translation */
+ cpu_buffer->subbuf_ids = subbuf_ids;
+
+ meta->meta_struct_len = sizeof(*meta);
+ meta->nr_subbufs = nr_subbufs;
+ meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+ meta->meta_page_size = meta->subbuf_size;
+
+ rb_update_meta_page(cpu_buffer);
+}
+
+static struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-EINVAL);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->user_mapped) {
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ return cpu_buffer;
+}
+
+static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+/*
+ * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need
+ * to be set-up or torn-down.
+ */
+static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
+ bool inc)
+{
+ unsigned long flags;
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ /* mapped is always greater or equal to user_mapped */
+ if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ return -EINVAL;
+
+ if (inc && cpu_buffer->mapped == UINT_MAX)
+ return -EBUSY;
+
+ if (WARN_ON(!inc && cpu_buffer->user_mapped == 0))
+ return -EINVAL;
+
+ mutex_lock(&cpu_buffer->buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (inc) {
+ cpu_buffer->user_mapped++;
+ cpu_buffer->mapped++;
+ } else {
+ cpu_buffer->user_mapped--;
+ cpu_buffer->mapped--;
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ mutex_unlock(&cpu_buffer->buffer->mutex);
+
+ return 0;
+}
+
+/*
+ * +--------------+ pgoff == 0
+ * | meta page |
+ * +--------------+ pgoff == 1
+ * | subbuffer 0 |
+ * | |
+ * +--------------+ pgoff == (1 + (1 << subbuf_order))
+ * | subbuffer 1 |
+ * | |
+ * ...
+ */
+#ifdef CONFIG_MMU
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
+ unsigned int subbuf_pages, subbuf_order;
+ struct page **pages __free(kfree) = NULL;
+ int p = 0, s = 0;
+ int err;
+
+ /* Refuse MP_PRIVATE or writable mappings */
+ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
+ !(vma->vm_flags & VM_MAYSHARE))
+ return -EPERM;
+
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ if (subbuf_order && pgoff % subbuf_pages)
+ return -EINVAL;
+
+ /*
+ * Make sure the mapping cannot become writable later. Also tell the VM
+ * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
+ */
+ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP,
+ VM_MAYWRITE);
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
+ nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */
+ if (nr_pages <= pgoff)
+ return -EINVAL;
+
+ nr_pages -= pgoff;
+
+ nr_vma_pages = vma_pages(vma);
+ if (!nr_vma_pages || nr_vma_pages > nr_pages)
+ return -EINVAL;
+
+ nr_pages = nr_vma_pages;
+
+ pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (!pgoff) {
+ unsigned long meta_page_padding;
+
+ pages[p++] = virt_to_page(cpu_buffer->meta_page);
+
+ /*
+ * Pad with the zero-page to align the meta-page with the
+ * sub-buffers.
+ */
+ meta_page_padding = subbuf_pages - 1;
+ while (meta_page_padding-- && p < nr_pages) {
+ unsigned long __maybe_unused zero_addr =
+ vma->vm_start + (PAGE_SIZE * p);
+
+ pages[p++] = ZERO_PAGE(zero_addr);
+ }
+ } else {
+ /* Skip the meta-page */
+ pgoff -= subbuf_pages;
+
+ s += pgoff / subbuf_pages;
+ }
+
+ while (p < nr_pages) {
+ struct page *page;
+ int off = 0;
+
+ if (WARN_ON_ONCE(s >= nr_subbufs))
+ return -EINVAL;
+
+ page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+
+ for (; off < (1 << (subbuf_order)); off++, page++) {
+ if (p >= nr_pages)
+ break;
+
+ pages[p++] = page;
+ }
+ s++;
+ }
+
+ err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+
+ return err;
+}
+#else
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+ struct vm_area_struct *vma)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags, *subbuf_ids;
+ int err;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(mutex)(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->user_mapped) {
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err)
+ err = __rb_inc_dec_mapped(cpu_buffer, true);
+ return err;
+ }
+
+ /* prevent another thread from changing buffer/sub-buffer sizes */
+ guard(mutex)(&buffer->mutex);
+
+ err = rb_alloc_meta_page(cpu_buffer);
+ if (err)
+ return err;
+
+ /* subbuf_ids include the reader while nr_pages does not */
+ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
+ if (!subbuf_ids) {
+ rb_free_meta_page(cpu_buffer);
+ return -ENOMEM;
+ }
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ /*
+ * Lock all readers to block any subbuf swap until the subbuf IDs are
+ * assigned.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ /* This is the first time it is mapped by user */
+ cpu_buffer->mapped++;
+ cpu_buffer->user_mapped = 1;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ } else {
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+ }
+
+ return 0;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(mutex)(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->user_mapped) {
+ return -ENODEV;
+ } else if (cpu_buffer->user_mapped > 1) {
+ __rb_inc_dec_mapped(cpu_buffer, false);
+ return 0;
+ }
+
+ guard(mutex)(&buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ /* This is the last user space mapping */
+ if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped))
+ cpu_buffer->mapped--;
+ cpu_buffer->user_mapped = 0;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+
+ return 0;
+}
+
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ unsigned long missed_events;
+ unsigned long reader_size;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return (int)PTR_ERR(cpu_buffer);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+consume:
+ if (rb_per_cpu_empty(cpu_buffer))
+ goto out;
+
+ reader_size = rb_page_size(cpu_buffer->reader_page);
+
+ /*
+ * There are data to be read on the current reader page, we can
+ * return to the caller. But before that, we assume the latter will read
+ * everything. Let's update the kernel reader accordingly.
+ */
+ if (cpu_buffer->reader_page->read < reader_size) {
+ while (cpu_buffer->reader_page->read < reader_size)
+ rb_advance_reader(cpu_buffer);
+ goto out;
+ }
+
+ reader = rb_get_reader_page(cpu_buffer);
+ if (WARN_ON(!reader))
+ goto out;
+
+ /* Check if any events were dropped */
+ missed_events = cpu_buffer->lost_events;
+
+ if (missed_events) {
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
+ struct buffer_data_page *bpage = reader->page;
+ unsigned int commit;
+ /*
+ * Use the real_end for the data size,
+ * This gives us a chance to store the lost events
+ * on the page.
+ */
+ if (reader->real_end)
+ local_set(&bpage->commit, reader->real_end);
+ /*
+ * If there is room at the end of the page to save the
+ * missed events, then record it there.
+ */
+ commit = rb_page_size(reader);
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
+ memcpy(&bpage->data[commit], &missed_events,
+ sizeof(missed_events));
+ local_add(RB_MISSED_STORED, &bpage->commit);
+ }
+ local_add(RB_MISSED_EVENTS, &bpage->commit);
+ } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+ "Reader on commit with %ld missed events",
+ missed_events)) {
+ /*
+ * There shouldn't be any missed events if the tail_page
+ * is on the reader page. But if the tail page is not on the
+ * reader page and the commit_page is, that would mean that
+ * there's a commit_overrun (an interrupt preempted an
+ * addition of an event and then filled the buffer
+ * with new events). In this case it's not an
+ * error, but it should still be reported.
+ *
+ * TODO: Add missed events to the page for user space to know.
+ */
+ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+ cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
+ }
+ }
+
+ cpu_buffer->lost_events = 0;
+
+ goto consume;
+
+out:
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_kernel_vmap_range(cpu_buffer->reader_page->page,
+ buffer->subbuf_size + BUF_PAGE_HDR_SIZE);
+
+ rb_update_meta_page(cpu_buffer);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -6180,9 +7484,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
- data->bytes_dropped += len;
- else
data->bytes_dropped_nested += len;
+ else
+ data->bytes_dropped += len;
}
return len;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 008187ebd7fe..cdc3aea12c93 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -307,14 +307,14 @@ static void ring_buffer_producer(void)
if (!disable_reader) {
if (consumer_fifo)
trace_printk("Running Consumer at SCHED_FIFO %s\n",
- consumer_fifo == 1 ? "low" : "high");
+ str_low_high(consumer_fifo == 1));
else
trace_printk("Running Consumer at nice: %d\n",
consumer_nice);
}
if (producer_fifo)
trace_printk("Running Producer at SCHED_FIFO %s\n",
- producer_fifo == 1 ? "low" : "high");
+ str_low_high(producer_fifo == 1));
else
trace_printk("Running Producer at nice: %d\n",
producer_nice);
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 831779607e84..b39f36013ef2 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -25,30 +25,16 @@ menuconfig RV
For further information, see:
Documentation/trace/rv/runtime-verification.rst
-config RV_MON_WIP
- depends on RV
- depends on PREEMPT_TRACER
- select DA_MON_EVENTS_IMPLICIT
- bool "wip monitor"
- help
- Enable wip (wakeup in preemptive) sample monitor that illustrates
- the usage of per-cpu monitors, and one limitation of the
- preempt_disable/enable events.
-
- For further information, see:
- Documentation/trace/rv/monitor_wip.rst
-
-config RV_MON_WWNR
- depends on RV
- select DA_MON_EVENTS_ID
- bool "wwnr monitor"
- help
- Enable wwnr (wakeup while not running) sample monitor, this is a
- sample monitor that illustrates the usage of per-task monitor.
- The model is borken on purpose: it serves to test reactors.
-
- For further information, see:
- Documentation/trace/rv/monitor_wwnr.rst
+source "kernel/trace/rv/monitors/wip/Kconfig"
+source "kernel/trace/rv/monitors/wwnr/Kconfig"
+source "kernel/trace/rv/monitors/sched/Kconfig"
+source "kernel/trace/rv/monitors/tss/Kconfig"
+source "kernel/trace/rv/monitors/sco/Kconfig"
+source "kernel/trace/rv/monitors/snroc/Kconfig"
+source "kernel/trace/rv/monitors/scpd/Kconfig"
+source "kernel/trace/rv/monitors/snep/Kconfig"
+source "kernel/trace/rv/monitors/sncid/Kconfig"
+# Add new monitors here
config RV_REACTORS
bool "Runtime verification reactors"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 963d14875b45..f9b2cd0483c3 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -1,8 +1,18 @@
# SPDX-License-Identifier: GPL-2.0
+ccflags-y += -I $(src) # needed for trace events
+
obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
+obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o
+obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o
+obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o
+obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o
+obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o
+obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o
+# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig
new file mode 100644
index 000000000000..ae3eb410abd7
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCHED
+ depends on RV
+ bool "sched monitor"
+ help
+ Collection of monitors to check the scheduler behaves according to specifications.
+ Enable this to enable all scheduler specification supported by the current kernel.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
new file mode 100644
index 000000000000..905e03c3c934
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+#define MODULE_NAME "sched"
+
+#include "sched.h"
+
+struct rv_monitor rv_sched;
+
+struct rv_monitor rv_sched = {
+ .name = "sched",
+ .description = "container for several scheduler monitor specifications.",
+ .enable = NULL,
+ .disable = NULL,
+ .reset = NULL,
+ .enabled = 0,
+};
+
+static int __init register_sched(void)
+{
+ rv_register_monitor(&rv_sched, NULL);
+ return 0;
+}
+
+static void __exit unregister_sched(void)
+{
+ rv_unregister_monitor(&rv_sched);
+}
+
+module_init(register_sched);
+module_exit(unregister_sched);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications.");
diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h
new file mode 100644
index 000000000000..ba148dd8d48b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/sched.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+extern struct rv_monitor rv_sched;
diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig
new file mode 100644
index 000000000000..097c96cccdd7
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCO
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sco monitor"
+ help
+ Monitor to ensure sched_set_state happens only in thread context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
new file mode 100644
index 000000000000..4cff59220bfc
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sco"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sco.h"
+
+static struct rv_monitor rv_sco;
+DECLARE_DA_MON_PER_CPU(sco, unsigned char);
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ da_handle_start_event_sco(sched_set_state_sco);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_sco(schedule_entry_sco);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_sco(schedule_exit_sco);
+}
+
+static int enable_sco(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sco();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_sco(void)
+{
+ rv_sco.enabled = 0;
+
+ rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_sco();
+}
+
+static struct rv_monitor rv_sco = {
+ .name = "sco",
+ .description = "scheduling context operations.",
+ .enable = enable_sco,
+ .disable = disable_sco,
+ .reset = da_monitor_reset_all_sco,
+ .enabled = 0,
+};
+
+static int __init register_sco(void)
+{
+ rv_register_monitor(&rv_sco, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_sco(void)
+{
+ rv_unregister_monitor(&rv_sco);
+}
+
+module_init(register_sco);
+module_exit(unregister_sco);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sco: scheduling context operations.");
diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h
new file mode 100644
index 000000000000..7a4c1f2d5ca1
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sco automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sco {
+ thread_context_sco = 0,
+ scheduling_context_sco,
+ state_max_sco
+};
+
+#define INVALID_STATE state_max_sco
+
+enum events_sco {
+ sched_set_state_sco = 0,
+ schedule_entry_sco,
+ schedule_exit_sco,
+ event_max_sco
+};
+
+struct automaton_sco {
+ char *state_names[state_max_sco];
+ char *event_names[event_max_sco];
+ unsigned char function[state_max_sco][event_max_sco];
+ unsigned char initial_state;
+ bool final_states[state_max_sco];
+};
+
+static const struct automaton_sco automaton_sco = {
+ .state_names = {
+ "thread_context",
+ "scheduling_context"
+ },
+ .event_names = {
+ "sched_set_state",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { thread_context_sco, scheduling_context_sco, INVALID_STATE },
+ { INVALID_STATE, INVALID_STATE, thread_context_sco },
+ },
+ .initial_state = thread_context_sco,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h
new file mode 100644
index 000000000000..b711cd9024ec
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SCO
+DEFINE_EVENT(event_da_monitor, event_sco,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_sco,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SCO */
diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig
new file mode 100644
index 000000000000..b9114fbf680f
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCPD
+ depends on RV
+ depends on PREEMPT_TRACER
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "scpd monitor"
+ help
+ Monitor to ensure schedule is called with preemption disabled.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
new file mode 100644
index 000000000000..cbdd6a5f8d7f
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "scpd"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "scpd.h"
+
+static struct rv_monitor rv_scpd;
+DECLARE_DA_MON_PER_CPU(scpd, unsigned char);
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_scpd(preempt_disable_scpd);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_scpd(preempt_enable_scpd);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_scpd(schedule_entry_scpd);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_event_scpd(schedule_exit_scpd);
+}
+
+static int enable_scpd(void)
+{
+ int retval;
+
+ retval = da_monitor_init_scpd();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_scpd(void)
+{
+ rv_scpd.enabled = 0;
+
+ rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_scpd();
+}
+
+static struct rv_monitor rv_scpd = {
+ .name = "scpd",
+ .description = "schedule called with preemption disabled.",
+ .enable = enable_scpd,
+ .disable = disable_scpd,
+ .reset = da_monitor_reset_all_scpd,
+ .enabled = 0,
+};
+
+static int __init register_scpd(void)
+{
+ rv_register_monitor(&rv_scpd, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_scpd(void)
+{
+ rv_unregister_monitor(&rv_scpd);
+}
+
+module_init(register_scpd);
+module_exit(unregister_scpd);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("scpd: schedule called with preemption disabled.");
diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h
new file mode 100644
index 000000000000..295f735a5811
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of scpd automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_scpd {
+ cant_sched_scpd = 0,
+ can_sched_scpd,
+ state_max_scpd
+};
+
+#define INVALID_STATE state_max_scpd
+
+enum events_scpd {
+ preempt_disable_scpd = 0,
+ preempt_enable_scpd,
+ schedule_entry_scpd,
+ schedule_exit_scpd,
+ event_max_scpd
+};
+
+struct automaton_scpd {
+ char *state_names[state_max_scpd];
+ char *event_names[event_max_scpd];
+ unsigned char function[state_max_scpd][event_max_scpd];
+ unsigned char initial_state;
+ bool final_states[state_max_scpd];
+};
+
+static const struct automaton_scpd automaton_scpd = {
+ .state_names = {
+ "cant_sched",
+ "can_sched"
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE },
+ { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd },
+ },
+ .initial_state = cant_sched_scpd,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h
new file mode 100644
index 000000000000..6b0f4aa4732e
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SCPD
+DEFINE_EVENT(event_da_monitor, event_scpd,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_scpd,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SCPD */
diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig
new file mode 100644
index 000000000000..76bcfef4fd10
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNCID
+ depends on RV
+ depends on IRQSOFF_TRACER
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sncid monitor"
+ help
+ Monitor to ensure schedule is not called with interrupt disabled.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c
new file mode 100644
index 000000000000..f5037cd6214c
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/sncid.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sncid"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sncid.h"
+
+static struct rv_monitor rv_sncid;
+DECLARE_DA_MON_PER_CPU(sncid, unsigned char);
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_sncid(irq_disable_sncid);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_sncid(irq_enable_sncid);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_start_event_sncid(schedule_entry_sncid);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_sncid(schedule_exit_sncid);
+}
+
+static int enable_sncid(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sncid();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_sncid(void)
+{
+ rv_sncid.enabled = 0;
+
+ rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_sncid();
+}
+
+static struct rv_monitor rv_sncid = {
+ .name = "sncid",
+ .description = "schedule not called with interrupt disabled.",
+ .enable = enable_sncid,
+ .disable = disable_sncid,
+ .reset = da_monitor_reset_all_sncid,
+ .enabled = 0,
+};
+
+static int __init register_sncid(void)
+{
+ rv_register_monitor(&rv_sncid, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_sncid(void)
+{
+ rv_unregister_monitor(&rv_sncid);
+}
+
+module_init(register_sncid);
+module_exit(unregister_sncid);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled.");
diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h
new file mode 100644
index 000000000000..21304725142b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/sncid.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sncid automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sncid {
+ can_sched_sncid = 0,
+ cant_sched_sncid,
+ state_max_sncid
+};
+
+#define INVALID_STATE state_max_sncid
+
+enum events_sncid {
+ irq_disable_sncid = 0,
+ irq_enable_sncid,
+ schedule_entry_sncid,
+ schedule_exit_sncid,
+ event_max_sncid
+};
+
+struct automaton_sncid {
+ char *state_names[state_max_sncid];
+ char *event_names[event_max_sncid];
+ unsigned char function[state_max_sncid][event_max_sncid];
+ unsigned char initial_state;
+ bool final_states[state_max_sncid];
+};
+
+static const struct automaton_sncid automaton_sncid = {
+ .state_names = {
+ "can_sched",
+ "cant_sched"
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid },
+ { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE },
+ },
+ .initial_state = can_sched_sncid,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h
new file mode 100644
index 000000000000..3ce42a57671d
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/sncid_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNCID
+DEFINE_EVENT(event_da_monitor, event_sncid,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_sncid,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SNCID */
diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig
new file mode 100644
index 000000000000..77527f971232
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNEP
+ depends on RV
+ depends on PREEMPT_TRACER
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "snep monitor"
+ help
+ Monitor to ensure schedule does not enable preempt.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
new file mode 100644
index 000000000000..0076ba6d7ea4
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "snep"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "snep.h"
+
+static struct rv_monitor rv_snep;
+DECLARE_DA_MON_PER_CPU(snep, unsigned char);
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_snep(preempt_disable_snep);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_snep(preempt_enable_snep);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_snep(schedule_entry_snep);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_snep(schedule_exit_snep);
+}
+
+static int enable_snep(void)
+{
+ int retval;
+
+ retval = da_monitor_init_snep();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_snep(void)
+{
+ rv_snep.enabled = 0;
+
+ rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_snep();
+}
+
+static struct rv_monitor rv_snep = {
+ .name = "snep",
+ .description = "schedule does not enable preempt.",
+ .enable = enable_snep,
+ .disable = disable_snep,
+ .reset = da_monitor_reset_all_snep,
+ .enabled = 0,
+};
+
+static int __init register_snep(void)
+{
+ rv_register_monitor(&rv_snep, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_snep(void)
+{
+ rv_unregister_monitor(&rv_snep);
+}
+
+module_init(register_snep);
+module_exit(unregister_snep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("snep: schedule does not enable preempt.");
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
new file mode 100644
index 000000000000..6d16b9ad931e
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of snep automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_snep {
+ non_scheduling_context_snep = 0,
+ scheduling_contex_snep,
+ state_max_snep
+};
+
+#define INVALID_STATE state_max_snep
+
+enum events_snep {
+ preempt_disable_snep = 0,
+ preempt_enable_snep,
+ schedule_entry_snep,
+ schedule_exit_snep,
+ event_max_snep
+};
+
+struct automaton_snep {
+ char *state_names[state_max_snep];
+ char *event_names[event_max_snep];
+ unsigned char function[state_max_snep][event_max_snep];
+ unsigned char initial_state;
+ bool final_states[state_max_snep];
+};
+
+static const struct automaton_snep automaton_snep = {
+ .state_names = {
+ "non_scheduling_context",
+ "scheduling_contex"
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE },
+ { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep },
+ },
+ .initial_state = non_scheduling_context_snep,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h
new file mode 100644
index 000000000000..01aad49a949a
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNEP
+DEFINE_EVENT(event_da_monitor, event_snep,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_snep,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SNEP */
diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig
new file mode 100644
index 000000000000..6e4365a2fea3
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNROC
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_ID
+ bool "snroc monitor"
+ help
+ Monitor to ensure sched_set_state happens only in the respective task's context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
new file mode 100644
index 000000000000..bb1f60d55296
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "snroc"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "snroc.h"
+
+static struct rv_monitor rv_snroc;
+DECLARE_DA_MON_PER_TASK(snroc, unsigned char);
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ da_handle_event_snroc(tsk, sched_set_state_snroc);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_start_event_snroc(prev, sched_switch_out_snroc);
+ da_handle_event_snroc(next, sched_switch_in_snroc);
+}
+
+static int enable_snroc(void)
+{
+ int retval;
+
+ retval = da_monitor_init_snroc();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch);
+
+ return 0;
+}
+
+static void disable_snroc(void)
+{
+ rv_snroc.enabled = 0;
+
+ rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch);
+
+ da_monitor_destroy_snroc();
+}
+
+static struct rv_monitor rv_snroc = {
+ .name = "snroc",
+ .description = "set non runnable on its own context.",
+ .enable = enable_snroc,
+ .disable = disable_snroc,
+ .reset = da_monitor_reset_all_snroc,
+ .enabled = 0,
+};
+
+static int __init register_snroc(void)
+{
+ rv_register_monitor(&rv_snroc, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_snroc(void)
+{
+ rv_unregister_monitor(&rv_snroc);
+}
+
+module_init(register_snroc);
+module_exit(unregister_snroc);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("snroc: set non runnable on its own context.");
diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h
new file mode 100644
index 000000000000..c3650a2b1b10
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of snroc automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_snroc {
+ other_context_snroc = 0,
+ own_context_snroc,
+ state_max_snroc
+};
+
+#define INVALID_STATE state_max_snroc
+
+enum events_snroc {
+ sched_set_state_snroc = 0,
+ sched_switch_in_snroc,
+ sched_switch_out_snroc,
+ event_max_snroc
+};
+
+struct automaton_snroc {
+ char *state_names[state_max_snroc];
+ char *event_names[event_max_snroc];
+ unsigned char function[state_max_snroc][event_max_snroc];
+ unsigned char initial_state;
+ bool final_states[state_max_snroc];
+};
+
+static const struct automaton_snroc automaton_snroc = {
+ .state_names = {
+ "other_context",
+ "own_context"
+ },
+ .event_names = {
+ "sched_set_state",
+ "sched_switch_in",
+ "sched_switch_out"
+ },
+ .function = {
+ { INVALID_STATE, own_context_snroc, INVALID_STATE },
+ { own_context_snroc, INVALID_STATE, other_context_snroc },
+ },
+ .initial_state = other_context_snroc,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h
new file mode 100644
index 000000000000..50114cef5122
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNROC
+DEFINE_EVENT(event_da_monitor_id, event_snroc,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_snroc,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_SNROC */
diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig
new file mode 100644
index 000000000000..479f86f52e60
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_TSS
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "tss monitor"
+ help
+ Monitor to ensure sched_switch happens only in scheduling context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c
new file mode 100644
index 000000000000..542787e6524f
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/tss.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "tss"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "tss.h"
+
+static struct rv_monitor rv_tss;
+DECLARE_DA_MON_PER_CPU(tss, unsigned char);
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_event_tss(sched_switch_tss);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_tss(schedule_entry_tss);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_tss(schedule_exit_tss);
+}
+
+static int enable_tss(void)
+{
+ int retval;
+
+ retval = da_monitor_init_tss();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("tss", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_tss(void)
+{
+ rv_tss.enabled = 0;
+
+ rv_detach_trace_probe("tss", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_tss();
+}
+
+static struct rv_monitor rv_tss = {
+ .name = "tss",
+ .description = "task switch while scheduling.",
+ .enable = enable_tss,
+ .disable = disable_tss,
+ .reset = da_monitor_reset_all_tss,
+ .enabled = 0,
+};
+
+static int __init register_tss(void)
+{
+ rv_register_monitor(&rv_tss, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_tss(void)
+{
+ rv_unregister_monitor(&rv_tss);
+}
+
+module_init(register_tss);
+module_exit(unregister_tss);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("tss: task switch while scheduling.");
diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h
new file mode 100644
index 000000000000..f0a36fda1b87
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/tss.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of tss automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_tss {
+ thread_tss = 0,
+ sched_tss,
+ state_max_tss
+};
+
+#define INVALID_STATE state_max_tss
+
+enum events_tss {
+ sched_switch_tss = 0,
+ schedule_entry_tss,
+ schedule_exit_tss,
+ event_max_tss
+};
+
+struct automaton_tss {
+ char *state_names[state_max_tss];
+ char *event_names[event_max_tss];
+ unsigned char function[state_max_tss][event_max_tss];
+ unsigned char initial_state;
+ bool final_states[state_max_tss];
+};
+
+static const struct automaton_tss automaton_tss = {
+ .state_names = {
+ "thread",
+ "sched"
+ },
+ .event_names = {
+ "sched_switch",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { INVALID_STATE, sched_tss, INVALID_STATE },
+ { sched_tss, INVALID_STATE, thread_tss },
+ },
+ .initial_state = thread_tss,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h
new file mode 100644
index 000000000000..4619dbb50cc0
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/tss_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_TSS
+DEFINE_EVENT(event_da_monitor, event_tss,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_tss,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_TSS */
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
new file mode 100644
index 000000000000..e464b9294865
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_WIP
+ depends on RV
+ depends on PREEMPT_TRACER
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable wip (wakeup in preemptive) sample monitor that illustrates
+ the usage of per-cpu monitors, and one limitation of the
+ preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index b2b49a27e886..ed758fec8608 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -10,7 +10,7 @@
#define MODULE_NAME "wip"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include <trace/events/preemptirq.h>
@@ -71,7 +71,7 @@ static struct rv_monitor rv_wip = {
static int __init register_wip(void)
{
- rv_register_monitor(&rv_wip);
+ rv_register_monitor(&rv_wip, NULL);
return 0;
}
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index 2e373f2c65ed..c7193748bf36 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Automatically generated C representation of wip automaton
* For further information about this format, see kernel documentation:
diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h
new file mode 100644
index 000000000000..aa2162f47a4c
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WIP
+DEFINE_EVENT(event_da_monitor, event_wip,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_wip,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_WIP */
diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig
new file mode 100644
index 000000000000..d3bfc20037db
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable wwnr (wakeup while not running) sample monitor, this is a
+ sample monitor that illustrates the usage of per-task monitor.
+ The model is borken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 0e43dd2db685..172f31c4b0f3 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -10,7 +10,7 @@
#define MODULE_NAME "wwnr"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include "wwnr.h"
@@ -70,7 +70,7 @@ static struct rv_monitor rv_wwnr = {
static int __init register_wwnr(void)
{
- rv_register_monitor(&rv_wwnr);
+ rv_register_monitor(&rv_wwnr, NULL);
return 0;
}
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index d0d9c4b8121b..0a59d23edf61 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Automatically generated C representation of wwnr automaton
* For further information about this format, see kernel documentation:
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
new file mode 100644
index 000000000000..fc97ec7476ad
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WWNR
+/* id is the pid of the task */
+DEFINE_EVENT(event_da_monitor_id, event_wwnr,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_wwnr,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_WWNR */
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 2f68e93fff0b..e4077500a91d 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -41,7 +41,7 @@
* per-task monitor, and so on), and the helper functions that glue the
* monitor to the system via trace. Generally, a monitor includes some form
* of trace output as a reaction for event parsing and exceptions,
- * as depicted bellow:
+ * as depicted below:
*
* Linux +----- RV Monitor ----------------------------------+ Formal
* Realm | | Realm
@@ -145,7 +145,7 @@
#ifdef CONFIG_DA_MON_EVENTS
#define CREATE_TRACE_POINTS
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#endif
#include "rv.h"
@@ -162,7 +162,7 @@ struct dentry *get_monitors_root(void)
/*
* Interface for the monitor register.
*/
-static LIST_HEAD(rv_monitors_list);
+LIST_HEAD(rv_monitors_list);
static int task_monitor_count;
static bool task_monitor_slots[RV_PER_TASK_MONITORS];
@@ -207,6 +207,35 @@ void rv_put_task_monitor_slot(int slot)
}
/*
+ * Monitors with a parent are nested,
+ * Monitors without a parent could be standalone or containers.
+ */
+bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
+{
+ return mdef->parent != NULL;
+}
+
+/*
+ * We set our list to have nested monitors listed after their parent
+ * if a monitor has a child element its a container.
+ * Containers can be also identified based on their function pointers:
+ * as they are not real monitors they do not need function definitions
+ * for enable()/disable(). Use this condition to find empty containers.
+ * Keep both conditions in case we have some non-compliant containers.
+ */
+bool rv_is_container_monitor(struct rv_monitor_def *mdef)
+{
+ struct rv_monitor_def *next;
+
+ if (list_is_last(&mdef->list, &rv_monitors_list))
+ return false;
+
+ next = list_next_entry(mdef, list);
+
+ return next->parent == mdef->monitor || !mdef->monitor->enable;
+}
+
+/*
* This section collects the monitor/ files and folders.
*/
static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
@@ -229,7 +258,8 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
if (mdef->monitor->enabled) {
mdef->monitor->enabled = 0;
- mdef->monitor->disable();
+ if (mdef->monitor->disable)
+ mdef->monitor->disable();
/*
* Wait for the execution of all events to finish.
@@ -243,19 +273,79 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
return 0;
}
+static void rv_disable_single(struct rv_monitor_def *mdef)
+{
+ __rv_disable_monitor(mdef, true);
+}
+
+static int rv_enable_single(struct rv_monitor_def *mdef)
+{
+ int retval;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mdef->monitor->enabled)
+ return 0;
+
+ retval = mdef->monitor->enable();
+
+ if (!retval)
+ mdef->monitor->enabled = 1;
+
+ return retval;
+}
+
+static void rv_disable_container(struct rv_monitor_def *mdef)
+{
+ struct rv_monitor_def *p = mdef;
+ int enabled = 0;
+
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (p->parent != mdef->monitor)
+ break;
+ enabled += __rv_disable_monitor(p, false);
+ }
+ if (enabled)
+ tracepoint_synchronize_unregister();
+ mdef->monitor->enabled = 0;
+}
+
+static int rv_enable_container(struct rv_monitor_def *mdef)
+{
+ struct rv_monitor_def *p = mdef;
+ int retval = 0;
+
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (retval || p->parent != mdef->monitor)
+ break;
+ retval = rv_enable_single(p);
+ }
+ if (retval)
+ rv_disable_container(mdef);
+ else
+ mdef->monitor->enabled = 1;
+ return retval;
+}
+
/**
* rv_disable_monitor - disable a given runtime monitor
+ * @mdef: Pointer to the monitor definition structure.
*
* Returns 0 on success.
*/
int rv_disable_monitor(struct rv_monitor_def *mdef)
{
- __rv_disable_monitor(mdef, true);
+ if (rv_is_container_monitor(mdef))
+ rv_disable_container(mdef);
+ else
+ rv_disable_single(mdef);
+
return 0;
}
/**
* rv_enable_monitor - enable a given runtime monitor
+ * @mdef: Pointer to the monitor definition structure.
*
* Returns 0 on success, error otherwise.
*/
@@ -263,15 +353,10 @@ int rv_enable_monitor(struct rv_monitor_def *mdef)
{
int retval;
- lockdep_assert_held(&rv_interface_lock);
-
- if (mdef->monitor->enabled)
- return 0;
-
- retval = mdef->monitor->enable();
-
- if (!retval)
- mdef->monitor->enabled = 1;
+ if (rv_is_container_monitor(mdef))
+ retval = rv_enable_container(mdef);
+ else
+ retval = rv_enable_single(mdef);
return retval;
}
@@ -304,7 +389,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
static const struct file_operations interface_enable_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitor_enable_write_data,
.read = monitor_enable_read_data,
};
@@ -327,7 +411,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf,
static const struct file_operations interface_desc_fops = {
.open = simple_open,
- .llseek = no_llseek,
.read = monitor_desc_read_data,
};
@@ -336,9 +419,9 @@ static const struct file_operations interface_desc_fops = {
* the monitor dir, where the specific options of the monitor
* are exposed.
*/
-static int create_monitor_dir(struct rv_monitor_def *mdef)
+static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent)
{
- struct dentry *root = get_monitors_root();
+ struct dentry *root = parent ? parent->root_d : get_monitors_root();
const char *name = mdef->monitor->name;
struct dentry *tmp;
int retval;
@@ -377,7 +460,11 @@ static int monitors_show(struct seq_file *m, void *p)
{
struct rv_monitor_def *mon_def = p;
- seq_printf(m, "%s\n", mon_def->monitor->name);
+ if (mon_def->parent)
+ seq_printf(m, "%s:%s\n", mon_def->parent->name,
+ mon_def->monitor->name);
+ else
+ seq_printf(m, "%s\n", mon_def->monitor->name);
return 0;
}
@@ -514,7 +601,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
struct rv_monitor_def *mdef;
int retval = -EINVAL;
bool enable = true;
- char *ptr;
+ char *ptr, *tmp;
int len;
if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
@@ -541,6 +628,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
retval = -EINVAL;
+ /* we support 1 nesting level, trim the parent */
+ tmp = strstr(ptr, ":");
+ if (tmp)
+ ptr = tmp+1;
+
list_for_each_entry(mdef, &rv_monitors_list, list) {
if (strcmp(ptr, mdef->monitor->name) != 0)
continue;
@@ -613,7 +705,7 @@ static void reset_all_monitors(void)
struct rv_monitor_def *mdef;
list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (mdef->monitor->enabled)
+ if (mdef->monitor->enabled && mdef->monitor->reset)
mdef->monitor->reset();
}
}
@@ -672,7 +764,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
static const struct file_operations monitoring_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitoring_on_write_data,
.read = monitoring_on_read_data,
};
@@ -686,18 +777,19 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef)
/**
* rv_register_monitor - register a rv monitor.
* @monitor: The rv_monitor to be registered.
+ * @parent: The parent of the monitor to be registered, NULL if not nested.
*
* Returns 0 if successful, error otherwise.
*/
-int rv_register_monitor(struct rv_monitor *monitor)
+int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
{
- struct rv_monitor_def *r;
+ struct rv_monitor_def *r, *p = NULL;
int retval = 0;
if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
pr_info("Monitor %s has a name longer than %d\n", monitor->name,
MAX_RV_MONITOR_NAME_SIZE);
- return -1;
+ return -EINVAL;
}
mutex_lock(&rv_interface_lock);
@@ -705,11 +797,27 @@ int rv_register_monitor(struct rv_monitor *monitor)
list_for_each_entry(r, &rv_monitors_list, list) {
if (strcmp(monitor->name, r->monitor->name) == 0) {
pr_info("Monitor %s is already registered\n", monitor->name);
- retval = -1;
+ retval = -EEXIST;
goto out_unlock;
}
}
+ if (parent) {
+ list_for_each_entry(r, &rv_monitors_list, list) {
+ if (strcmp(parent->name, r->monitor->name) == 0) {
+ p = r;
+ break;
+ }
+ }
+ }
+
+ if (p && rv_is_nested_monitor(p)) {
+ pr_info("Parent monitor %s is already nested, cannot nest further\n",
+ parent->name);
+ retval = -EINVAL;
+ goto out_unlock;
+ }
+
r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
if (!r) {
retval = -ENOMEM;
@@ -717,14 +825,19 @@ int rv_register_monitor(struct rv_monitor *monitor)
}
r->monitor = monitor;
+ r->parent = parent;
- retval = create_monitor_dir(r);
+ retval = create_monitor_dir(r, p);
if (retval) {
kfree(r);
goto out_unlock;
}
- list_add_tail(&r->list, &rv_monitors_list);
+ /* keep children close to the parent for easier visualisation */
+ if (p)
+ list_add(&r->list, &p->list);
+ else
+ list_add_tail(&r->list, &rv_monitors_list);
out_unlock:
mutex_unlock(&rv_interface_lock);
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index db6cb0913dbd..98fca0a1adbc 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -21,6 +21,7 @@ struct rv_interface {
#define MAX_RV_REACTOR_NAME_SIZE 32
extern struct mutex rv_interface_lock;
+extern struct list_head rv_monitors_list;
#ifdef CONFIG_RV_REACTORS
struct rv_reactor_def {
@@ -34,6 +35,7 @@ struct rv_reactor_def {
struct rv_monitor_def {
struct list_head list;
struct rv_monitor *monitor;
+ struct rv_monitor *parent;
struct dentry *root_d;
#ifdef CONFIG_RV_REACTORS
struct rv_reactor_def *rdef;
@@ -45,6 +47,8 @@ struct rv_monitor_def {
struct dentry *get_monitors_root(void);
int rv_disable_monitor(struct rv_monitor_def *mdef);
int rv_enable_monitor(struct rv_monitor_def *mdef);
+bool rv_is_container_monitor(struct rv_monitor_def *mdef);
+bool rv_is_nested_monitor(struct rv_monitor_def *mdef);
#ifdef CONFIG_RV_REACTORS
int reactor_populate_monitor(struct rv_monitor_def *mdef);
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 6aae106695b6..9501ca886d83 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -158,8 +158,9 @@ static const struct seq_operations monitor_reactors_seq_ops = {
.show = monitor_reactor_show
};
-static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
- bool reacting)
+static void monitor_swap_reactors_single(struct rv_monitor_def *mdef,
+ struct rv_reactor_def *rdef,
+ bool reacting, bool nested)
{
bool monitor_enabled;
@@ -179,10 +180,31 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor
mdef->reacting = reacting;
mdef->monitor->react = rdef->reactor->react;
- if (monitor_enabled)
+ /* enable only once if iterating through a container */
+ if (monitor_enabled && !nested)
rv_enable_monitor(mdef);
}
+static void monitor_swap_reactors(struct rv_monitor_def *mdef,
+ struct rv_reactor_def *rdef, bool reacting)
+{
+ struct rv_monitor_def *p = mdef;
+
+ if (rv_is_container_monitor(mdef))
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (p->parent != mdef->monitor)
+ break;
+ monitor_swap_reactors_single(p, rdef, reacting, true);
+ }
+ /*
+ * This call enables and disables the monitor if they were active.
+ * In case of a container, we already disabled all and will enable all.
+ * All nested monitors are enabled also if they were off, we may refine
+ * this logic in the future.
+ */
+ monitor_swap_reactors_single(mdef, rdef, reacting, false);
+}
+
static ssize_t
monitor_reactors_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
@@ -426,7 +448,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
static const struct file_operations reacting_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = reacting_on_write_data,
.read = reacting_on_read_data,
};
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
new file mode 100644
index 000000000000..422b75f58891
--- /dev/null
+++ b/kernel/trace/rv/rv_trace.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rv
+
+#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RV_H
+
+#include <linux/rv.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT
+DECLARE_EVENT_CLASS(event_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ __array( char, next_state, MAX_DA_NAME_LEN )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%s x %s -> %s %s",
+ __entry->state,
+ __entry->event,
+ __entry->next_state,
+ __entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor,
+
+ TP_PROTO(char *state, char *event),
+
+ TP_ARGS(state, event),
+
+ TP_STRUCT__entry(
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ ),
+
+ TP_printk("event %s not expected in the state %s",
+ __entry->event,
+ __entry->state)
+);
+
+#include <monitors/wip/wip_trace.h>
+#include <monitors/tss/tss_trace.h>
+#include <monitors/sco/sco_trace.h>
+#include <monitors/scpd/scpd_trace.h>
+#include <monitors/snep/snep_trace.h>
+#include <monitors/sncid/sncid_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
+
+#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
+
+#ifdef CONFIG_DA_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(id, state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ __array( char, next_state, MAX_DA_NAME_LEN )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
+ __entry->id = id;
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%d: %s x %s -> %s %s",
+ __entry->id,
+ __entry->state,
+ __entry->event,
+ __entry->next_state,
+ __entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event),
+
+ TP_ARGS(id, state, event),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s",
+ __entry->id,
+ __entry->event,
+ __entry->state)
+);
+
+#include <monitors/wwnr/wwnr_trace.h>
+#include <monitors/snroc/snroc_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
+
+#endif /* CONFIG_DA_MON_EVENTS_ID */
+#endif /* _TRACE_RV_H */
+
+/* This part ust be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE rv_trace
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 233d1af39fff..95ae7c4e5835 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -26,6 +26,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
@@ -48,6 +49,9 @@
#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <linux/sort.h>
+#include <linux/io.h> /* vmap_page_range() */
+#include <linux/fs_context.h>
#include <asm/setup.h> /* COMMAND_LINE_SIZE */
@@ -86,6 +90,7 @@ void __init disable_tracing_selftest(const char *reason)
static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
+static bool traceoff_after_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
@@ -116,6 +121,7 @@ static int tracing_disabled = 1;
cpumask_var_t __read_mostly tracing_buffer_mask;
+#define MAX_TRACER_SIZE 100
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
*
@@ -138,7 +144,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
-int __disable_trace_on_warning;
+static int __disable_trace_on_warning;
+
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+static const struct ctl_table trace_sysctl_table[] = {
+ {
+ .procname = "ftrace_dump_on_oops",
+ .data = &ftrace_dump_on_oops,
+ .maxlen = MAX_TRACER_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "traceoff_on_warning",
+ .data = &__disable_trace_on_warning,
+ .maxlen = sizeof(__disable_trace_on_warning),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = tracepoint_printk_sysctl,
+ },
+};
+
+static int __init init_trace_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_sysctls);
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
@@ -329,6 +368,13 @@ static int __init set_tracepoint_printk_stop(char *str)
}
__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);
+static int __init set_traceoff_after_boot(char *str)
+{
+ traceoff_after_boot = true;
+ return 1;
+}
+__setup("traceoff_after_boot", set_traceoff_after_boot);
+
unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
@@ -482,7 +528,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR)
+ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \
+ TRACE_ITER_COPY_MARKER)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -490,7 +537,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \
+ TRACE_ITER_COPY_MARKER)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -500,6 +548,54 @@ static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
};
+static struct trace_array *printk_trace = &global_trace;
+
+/* List of trace_arrays interested in the top level trace_marker */
+static LIST_HEAD(marker_copies);
+
+static __always_inline bool printk_binsafe(struct trace_array *tr)
+{
+ /*
+ * The binary format of traceprintk can cause a crash if used
+ * by a buffer from another boot. Force the use of the
+ * non binary version of trace_printk if the trace_printk
+ * buffer is a boot mapped ring buffer.
+ */
+ return !(tr->flags & TRACE_ARRAY_FL_BOOT);
+}
+
+static void update_printk_trace(struct trace_array *tr)
+{
+ if (printk_trace == tr)
+ return;
+
+ printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK;
+ printk_trace = tr;
+ tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
+}
+
+/* Returns true if the status of tr changed */
+static bool update_marker_trace(struct trace_array *tr, int enabled)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (enabled) {
+ if (!list_empty(&tr->marker_list))
+ return false;
+
+ list_add_rcu(&tr->marker_list, &marker_copies);
+ tr->trace_flags |= TRACE_ITER_COPY_MARKER;
+ return true;
+ }
+
+ if (list_empty(&tr->marker_list))
+ return false;
+
+ list_del_init(&tr->marker_list);
+ tr->trace_flags &= ~TRACE_ITER_COPY_MARKER;
+ return true;
+}
+
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
if (!tr)
@@ -512,19 +608,16 @@ LIST_HEAD(ftrace_trace_arrays);
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret = -ENODEV;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
tr->ref++;
- ret = 0;
- break;
+ return 0;
}
}
- mutex_unlock(&trace_types_lock);
- return ret;
+ return -ENODEV;
}
static void __trace_array_put(struct trace_array *this_tr)
@@ -570,19 +663,6 @@ int tracing_check_open_get_tr(struct trace_array *tr)
return 0;
}
-int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event)
-{
- if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
- !filter_match_preds(call->filter, rec)) {
- __trace_event_discard_commit(buffer, event);
- return 1;
- }
-
- return 0;
-}
-
/**
* trace_find_filtered_pid - check if a pid exists in a filtered_pid list
* @filtered_pids: The list of pids to check
@@ -965,7 +1045,8 @@ static inline void trace_access_lock_init(void)
#endif
#ifdef CONFIG_STACKTRACE
-static void __ftrace_trace_stack(struct trace_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
@@ -974,7 +1055,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
int skip, struct pt_regs *regs);
#else
-static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
+static inline void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
@@ -1117,7 +1199,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts);
*/
int __trace_puts(unsigned long ip, const char *str, int size)
{
- return __trace_array_puts(&global_trace, ip, str, size);
+ return __trace_array_puts(printk_trace, ip, str, size);
}
EXPORT_SYMBOL_GPL(__trace_puts);
@@ -1128,6 +1210,7 @@ EXPORT_SYMBOL_GPL(__trace_puts);
*/
int __trace_bputs(unsigned long ip, const char *str)
{
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct ring_buffer_event *event;
struct trace_buffer *buffer;
struct bputs_entry *entry;
@@ -1135,14 +1218,17 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int ret = 0;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!printk_binsafe(tr))
+ return __trace_puts(ip, str, strlen(str));
+
+ if (!(tr->trace_flags & TRACE_ITER_PRINTK))
return 0;
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
trace_ctx = tracing_gen_ctx();
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -1155,7 +1241,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
ret = 1;
out:
@@ -1191,6 +1277,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
return;
}
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
local_irq_save(flags);
update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags);
@@ -1323,7 +1415,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr)
lockdep_assert_held(&trace_types_lock);
spin_lock(&tr->snapshot_trigger_lock);
- if (tr->snapshot == UINT_MAX) {
+ if (tr->snapshot == UINT_MAX || tr->mapped) {
spin_unlock(&tr->snapshot_trigger_lock);
return -EBUSY;
}
@@ -1421,22 +1513,20 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
cond_update_fn_t update)
{
- struct cond_snapshot *cond_snapshot;
- int ret = 0;
+ struct cond_snapshot *cond_snapshot __free(kfree) =
+ kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
+ int ret;
- cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
if (!cond_snapshot)
return -ENOMEM;
cond_snapshot->cond_data = cond_data;
cond_snapshot->update = update;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto fail_unlock;
- }
+ if (tr->current_trace->use_max_tr)
+ return -EBUSY;
/*
* The cond_snapshot can only change to NULL without the
@@ -1446,29 +1536,20 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
* do safely with only holding the trace_types_lock and not
* having to take the max_lock.
*/
- if (tr->cond_snapshot) {
- ret = -EBUSY;
- goto fail_unlock;
- }
+ if (tr->cond_snapshot)
+ return -EBUSY;
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- goto fail_unlock;
+ return ret;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- tr->cond_snapshot = cond_snapshot;
+ tr->cond_snapshot = no_free_ptr(cond_snapshot);
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
- mutex_unlock(&trace_types_lock);
-
- return ret;
-
- fail_unlock:
- mutex_unlock(&trace_types_lock);
- kfree(cond_snapshot);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
@@ -1564,6 +1645,39 @@ void tracer_tracing_off(struct trace_array *tr)
}
/**
+ * tracer_tracing_disable() - temporary disable the buffer from write
+ * @tr: The trace array to disable its buffer for
+ *
+ * Expects trace_tracing_enable() to re-enable tracing.
+ * The difference between this and tracer_tracing_off() is that this
+ * is a counter and can nest, whereas, tracer_tracing_off() can
+ * be called multiple times and a single trace_tracing_on() will
+ * enable it.
+ */
+void tracer_tracing_disable(struct trace_array *tr)
+{
+ if (WARN_ON_ONCE(!tr->array_buffer.buffer))
+ return;
+
+ ring_buffer_record_disable(tr->array_buffer.buffer);
+}
+
+/**
+ * tracer_tracing_enable() - counter part of tracer_tracing_disable()
+ * @tr: The trace array that had tracer_tracincg_disable() called on it
+ *
+ * This is called after tracer_tracing_disable() has been called on @tr,
+ * when it's safe to re-enable tracing.
+ */
+void tracer_tracing_enable(struct trace_array *tr)
+{
+ if (WARN_ON_ONCE(!tr->array_buffer.buffer))
+ return;
+
+ ring_buffer_record_enable(tr->array_buffer.buffer);
+}
+
+/**
* tracing_off - turn off tracing buffers
*
* This function stops the tracing buffers from recording data.
@@ -1901,7 +2015,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
- strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ strscpy(max_data->comm, tsk->comm);
max_data->pid = tsk->pid;
/*
* If tsk == current, then use current_uid(), as that does not use
@@ -2181,10 +2295,10 @@ static __init int init_trace_selftests(void)
selftests_can_run = true;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (list_empty(&postponed_selftests))
- goto out;
+ return 0;
pr_info("Running postponed tracer tests:\n");
@@ -2213,17 +2327,10 @@ static __init int init_trace_selftests(void)
}
tracing_selftest_running = false;
- out:
- mutex_unlock(&trace_types_lock);
-
return 0;
}
core_initcall(init_trace_selftests);
#else
-static inline int run_tracer_selftest(struct tracer *type)
-{
- return 0;
-}
static inline int do_run_tracer_selftest(struct tracer *type)
{
return 0;
@@ -2357,6 +2464,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
ring_buffer_record_enable(buffer);
}
+static void tracing_reset_all_cpus(struct array_buffer *buf)
+{
+ struct trace_buffer *buffer = buf->buffer;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
+
+ ring_buffer_reset(buffer);
+
+ ring_buffer_record_enable(buffer);
+}
+
/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus_unlocked(void)
{
@@ -2515,6 +2641,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
trace_flags |= TRACE_FLAG_NEED_RESCHED;
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
}
@@ -2761,14 +2889,14 @@ static void output_printk(struct trace_event_buffer *fbuffer)
raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
}
-int tracepoint_printk_sysctl(struct ctl_table *table, int write,
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
int ret;
- mutex_lock(&tracepoint_printk_mutex);
+ guard(mutex)(&tracepoint_printk_mutex);
save_tracepoint_printk = tracepoint_printk;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
@@ -2781,16 +2909,13 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
tracepoint_printk = 0;
if (save_tracepoint_printk == tracepoint_printk)
- goto out;
+ return ret;
if (tracepoint_printk)
static_key_enable(&tracepoint_printk_key.key);
else
static_key_disable(&tracepoint_printk_key.key);
- out:
- mutex_unlock(&tracepoint_printk_mutex);
-
return ret;
}
@@ -2858,14 +2983,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
void
trace_function(struct trace_array *tr, unsigned long ip, unsigned long
- parent_ip, unsigned int trace_ctx)
+ parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs)
{
- struct trace_event_call *call = &event_function;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
+ int size = sizeof(*entry);
- event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long);
+
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size,
trace_ctx);
if (!event)
return;
@@ -2873,11 +3000,16 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- if (static_branch_unlikely(&trace_function_exports_enabled))
- ftrace_exports(event, TRACE_EXPORT_FUNCTION);
- __buffer_unlock_commit(buffer, event);
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
}
+#endif
+
+ if (static_branch_unlikely(&trace_function_exports_enabled))
+ ftrace_exports(event, TRACE_EXPORT_FUNCTION);
+ __buffer_unlock_commit(buffer, event);
}
#ifdef CONFIG_STACKTRACE
@@ -2885,7 +3017,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
#define FTRACE_KSTACK_NESTING 4
-#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
+#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING)
struct ftrace_stack {
unsigned long calls[FTRACE_KSTACK_ENTRIES];
@@ -2899,11 +3031,11 @@ struct ftrace_stacks {
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
-static void __ftrace_trace_stack(struct trace_buffer *buffer,
+static void __ftrace_trace_stack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned int trace_ctx,
int skip, struct pt_regs *regs)
{
- struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
unsigned int size, nr_entries;
struct ftrace_stack *fstack;
@@ -2946,6 +3078,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
nr_entries = stack_trace_save(fstack->calls, size, skip);
}
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Mark entry of stack trace as trampoline code */
+ if (tr->ops && tr->ops->trampoline) {
+ unsigned long tramp_start = tr->ops->trampoline;
+ unsigned long tramp_end = tramp_start + tr->ops->trampoline_size;
+ unsigned long *calls = fstack->calls;
+
+ for (int i = 0; i < nr_entries; i++) {
+ if (calls[i] >= tramp_start && calls[i] < tramp_end)
+ calls[i] = FTRACE_TRAMPOLINE_MARKER;
+ }
+ }
+#endif
+
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
struct_size(entry, caller, nr_entries),
trace_ctx);
@@ -2957,8 +3103,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
memcpy(&entry->caller, fstack->calls,
flex_array_size(entry, caller, nr_entries));
- if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
out:
/* Again, don't let gcc optimize things here */
@@ -2976,7 +3121,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr,
if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
}
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
@@ -2985,7 +3130,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
struct trace_buffer *buffer = tr->array_buffer.buffer;
if (rcu_is_watching()) {
- __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL);
return;
}
@@ -3002,7 +3147,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
return;
ct_irq_enter_irqson();
- __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
+ __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL);
ct_irq_exit_irqson();
}
@@ -3019,8 +3164,8 @@ void trace_dump_stack(int skip)
/* Skip 1 to skip this function. */
skip++;
#endif
- __ftrace_trace_stack(global_trace.array_buffer.buffer,
- tracing_gen_ctx(), skip, NULL);
+ __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer,
+ tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
@@ -3031,7 +3176,6 @@ static void
ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer, unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
@@ -3065,8 +3209,7 @@ ftrace_trace_userstack(struct trace_array *tr,
memset(&entry->caller, 0, sizeof(entry->caller));
stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
- if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ __buffer_unlock_commit(buffer, event);
out_drop_count:
__this_cpu_dec(user_stack_count);
@@ -3235,15 +3378,17 @@ static void trace_printk_start_stop_comm(int enabled)
*/
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
- struct trace_event_call *call = &event_bprint;
struct ring_buffer_event *event;
struct trace_buffer *buffer;
- struct trace_array *tr = &global_trace;
+ struct trace_array *tr = READ_ONCE(printk_trace);
struct bprint_entry *entry;
unsigned int trace_ctx;
char *tbuffer;
int len = 0, size;
+ if (!printk_binsafe(tr))
+ return trace_vprintk(ip, fmt, args);
+
if (unlikely(tracing_selftest_running || tracing_disabled))
return 0;
@@ -3276,10 +3421,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
entry->fmt = fmt;
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
- }
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
out:
ring_buffer_nest_end(buffer);
@@ -3294,12 +3437,10 @@ out_nobuffer:
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
-__printf(3, 0)
-static int
-__trace_array_vprintk(struct trace_buffer *buffer,
- unsigned long ip, const char *fmt, va_list args)
+static __printf(3, 0)
+int __trace_array_vprintk(struct trace_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
{
- struct trace_event_call *call = &event_print;
struct ring_buffer_event *event;
int len = 0, size;
struct print_entry *entry;
@@ -3334,10 +3475,8 @@ __trace_array_vprintk(struct trace_buffer *buffer,
entry->ip = ip;
memcpy(&entry->buf, tbuffer, len + 1);
- if (!call_filter_check_discard(call, entry, buffer, event)) {
- __buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
- }
+ __buffer_unlock_commit(buffer, event);
+ ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
out:
ring_buffer_nest_end(buffer);
@@ -3350,7 +3489,6 @@ out_nobuffer:
return len;
}
-__printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args)
{
@@ -3380,7 +3518,6 @@ int trace_array_vprintk(struct trace_array *tr,
* Note, trace_array_init_printk() must be called on @tr before this
* can be used.
*/
-__printf(3, 0)
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...)
{
@@ -3425,14 +3562,13 @@ int trace_array_init_printk(struct trace_array *tr)
}
EXPORT_SYMBOL_GPL(trace_array_init_printk);
-__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...)
{
int ret;
va_list ap;
- if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
+ if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -3441,10 +3577,9 @@ int trace_array_printk_buf(struct trace_buffer *buffer,
return ret;
}
-__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
- return trace_array_vprintk(&global_trace, ip, fmt, args);
+ return trace_array_vprintk(printk_trace, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);
@@ -3567,17 +3702,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter)
}
/* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str,
- bool star, int len)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
{
unsigned long addr = (unsigned long)str;
struct trace_event *trace_event;
struct trace_event_call *event;
- /* Ignore strings with no length */
- if (star && !len)
- return true;
-
/* OK if part of the event data */
if ((addr >= (unsigned long)iter->ent) &&
(addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3617,133 +3747,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
return false;
}
-static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
-
-static int test_can_verify_check(const char *fmt, ...)
-{
- char buf[16];
- va_list ap;
- int ret;
-
- /*
- * The verifier is dependent on vsnprintf() modifies the va_list
- * passed to it, where it is sent as a reference. Some architectures
- * (like x86_32) passes it by value, which means that vsnprintf()
- * does not modify the va_list passed to it, and the verifier
- * would then need to be able to understand all the values that
- * vsnprintf can use. If it is passed by value, then the verifier
- * is disabled.
- */
- va_start(ap, fmt);
- vsnprintf(buf, 16, "%d", ap);
- ret = va_arg(ap, int);
- va_end(ap);
-
- return ret;
-}
-
-static void test_can_verify(void)
-{
- if (!test_can_verify_check("%d %d", 0, 1)) {
- pr_info("trace event string verifier disabled\n");
- static_branch_inc(&trace_no_verify);
- }
-}
-
/**
- * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * ignore_event - Check dereferenced fields while writing to the seq buffer
* @iter: The iterator that holds the seq buffer and the event being printed
- * @fmt: The format used to print the event
- * @ap: The va_list holding the data to print from @fmt.
*
- * This writes the data into the @iter->seq buffer using the data from
- * @fmt and @ap. If the format has a %s, then the source of the string
- * is examined to make sure it is safe to print, otherwise it will
- * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
- * pointer.
+ * At boot up, test_event_printk() will flag any event that dereferences
+ * a string with "%s" that does exist in the ring buffer. It may still
+ * be valid, as the string may point to a static string in the kernel
+ * rodata that never gets freed. But if the string pointer is pointing
+ * to something that was allocated, there's a chance that it can be freed
+ * by the time the user reads the trace. This would cause a bad memory
+ * access by the kernel and possibly crash the system.
+ *
+ * This function will check if the event has any fields flagged as needing
+ * to be checked at runtime and perform those checks.
+ *
+ * If it is found that a field is unsafe, it will write into the @iter->seq
+ * a message stating what was found to be unsafe.
+ *
+ * @return: true if the event is unsafe and should be ignored,
+ * false otherwise.
*/
-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap)
+bool ignore_event(struct trace_iterator *iter)
{
- const char *p = fmt;
- const char *str;
- int i, j;
+ struct ftrace_event_field *field;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+ struct list_head *head;
+ struct trace_seq *seq;
+ const void *ptr;
- if (WARN_ON_ONCE(!fmt))
- return;
+ trace_event = ftrace_find_event(iter->ent->type);
- if (static_branch_unlikely(&trace_no_verify))
- goto print;
+ seq = &iter->seq;
- /* Don't bother checking when doing a ftrace_dump() */
- if (iter->fmt == static_fmt_buf)
- goto print;
+ if (!trace_event) {
+ trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type);
+ return true;
+ }
- while (*p) {
- bool star = false;
- int len = 0;
-
- j = 0;
-
- /* We only care about %s and variants */
- for (i = 0; p[i]; i++) {
- if (i + 1 >= iter->fmt_size) {
- /*
- * If we can't expand the copy buffer,
- * just print it.
- */
- if (!trace_iter_expand_format(iter))
- goto print;
- }
+ event = container_of(trace_event, struct trace_event_call, event);
+ if (!(event->flags & TRACE_EVENT_FL_TEST_STR))
+ return false;
- if (p[i] == '\\' && p[i+1]) {
- i++;
- continue;
- }
- if (p[i] == '%') {
- /* Need to test cases like %08.*s */
- for (j = 1; p[i+j]; j++) {
- if (isdigit(p[i+j]) ||
- p[i+j] == '.')
- continue;
- if (p[i+j] == '*') {
- star = true;
- continue;
- }
- break;
- }
- if (p[i+j] == 's')
- break;
- star = false;
- }
- j = 0;
- }
- /* If no %s found then just print normally */
- if (!p[i])
- break;
+ head = trace_get_fields(event);
+ if (!head) {
+ trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n",
+ trace_event_name(event));
+ return true;
+ }
- /* Copy up to the %s, and print that */
- strncpy(iter->fmt, p, i);
- iter->fmt[i] = '\0';
- trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ /* Offsets are from the iter->ent that points to the raw event */
+ ptr = iter->ent;
- /*
- * If iter->seq is full, the above call no longer guarantees
- * that ap is in sync with fmt processing, and further calls
- * to va_arg() can return wrong positional arguments.
- *
- * Ensure that ap is no longer used in this case.
- */
- if (iter->seq.full) {
- p = "";
- break;
- }
+ list_for_each_entry(field, head, link) {
+ const char *str;
+ bool good;
+
+ if (!field->needs_test)
+ continue;
- if (star)
- len = va_arg(ap, int);
+ str = *(const char **)(ptr + field->offset);
- /* The ap now points to the string data of the %s */
- str = va_arg(ap, const char *);
+ good = trace_safe_str(iter, str);
/*
* If you hit this warning, it is likely that the
@@ -3754,45 +3820,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
- fmt, seq_buf_str(&iter->seq.seq))) {
- int ret;
-
- /* Try to safely read the string */
- if (star) {
- if (len + 1 > iter->fmt_size)
- len = iter->fmt_size - 1;
- if (len < 0)
- len = 0;
- ret = copy_from_kernel_nofault(iter->fmt, str, len);
- iter->fmt[len] = 0;
- star = false;
- } else {
- ret = strncpy_from_kernel_nofault(iter->fmt, str,
- iter->fmt_size);
- }
- if (ret < 0)
- trace_seq_printf(&iter->seq, "(0x%px)", str);
- else
- trace_seq_printf(&iter->seq, "(0x%px:%s)",
- str, iter->fmt);
- str = "[UNSAFE-MEMORY]";
- strcpy(iter->fmt, "%s");
- } else {
- strncpy(iter->fmt, p + i, j + 1);
- iter->fmt[j+1] = '\0';
+ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'",
+ trace_event_name(event), field->name)) {
+ trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n",
+ trace_event_name(event), field->name);
+ return true;
}
- if (star)
- trace_seq_printf(&iter->seq, iter->fmt, len, str);
- else
- trace_seq_printf(&iter->seq, iter->fmt, str);
-
- p += i + j + 1;
}
- print:
- if (*p)
- trace_seq_vprintf(&iter->seq, p, ap);
+ return false;
}
const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
@@ -3952,6 +3987,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
break;
entries++;
ring_buffer_iter_advance(buf_iter);
+ /* This could be a big loop */
+ cond_resched();
}
per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
@@ -4173,11 +4210,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
- preempt_model_none() ? "server" :
- preempt_model_voluntary() ? "desktop" :
- preempt_model_full() ? "preempt" :
- preempt_model_rt() ? "preempt_rt" :
- "unknown",
+ preempt_model_str(),
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
@@ -4260,6 +4293,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (event) {
if (tr->trace_flags & TRACE_ITER_FIELDS)
return print_event_fields(iter, event);
+ /*
+ * For TRACE_EVENT() events, the print_fmt is not
+ * safe to use if the array has delta offsets
+ * Force printing via the fields.
+ */
+ if ((tr->text_delta) &&
+ event->type > __TRACE_LAST_TYPE)
+ return print_event_fields(iter, event);
+
return event->funcs->trace(iter, sym_flags, event);
}
@@ -4915,6 +4957,11 @@ static int tracing_open(struct inode *inode, struct file *file)
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
+#ifdef CONFIG_TRACER_SNAPSHOT
+ /* arrays with mapped buffer range do not have snapshots */
+ if (tr->range_addr_start && t->use_max_tr)
+ return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
}
@@ -5007,7 +5054,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -5048,7 +5095,7 @@ static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = show_traces_release,
+ .release = tracing_seq_release,
};
static ssize_t
@@ -5096,7 +5143,6 @@ int tracing_set_cpumask(struct trace_array *tr,
*/
if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
@@ -5104,7 +5150,6 @@ int tracing_set_cpumask(struct trace_array *tr,
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
@@ -5127,6 +5172,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
+ if (count == 0 || count > KMALLOC_MAX_SIZE)
+ return -EINVAL;
+
if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
@@ -5163,7 +5211,8 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
u32 tracer_flags;
int i;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
+
tracer_flags = tr->current_trace->flags->val;
trace_opts = tr->current_trace->flags->opts;
@@ -5180,7 +5229,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
else
seq_printf(m, "no%s\n", trace_opts[i].name);
}
- mutex_unlock(&trace_types_lock);
return 0;
}
@@ -5233,7 +5281,9 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
if ((mask == TRACE_ITER_RECORD_TGID) ||
- (mask == TRACE_ITER_RECORD_CMD))
+ (mask == TRACE_ITER_RECORD_CMD) ||
+ (mask == TRACE_ITER_TRACE_PRINTK) ||
+ (mask == TRACE_ITER_COPY_MARKER))
lockdep_assert_held(&event_mutex);
/* do nothing if flag is already set */
@@ -5245,6 +5295,28 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (tr->current_trace->flag_changed(tr, mask, !!enabled))
return -EINVAL;
+ if (mask == TRACE_ITER_TRACE_PRINTK) {
+ if (enabled) {
+ update_printk_trace(tr);
+ } else {
+ /*
+ * The global_trace cannot clear this.
+ * It's flag only gets cleared if another instance sets it.
+ */
+ if (printk_trace == &global_trace)
+ return -EINVAL;
+ /*
+ * An instance must always have it set.
+ * by default, that's the global_trace instane.
+ */
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+ }
+ }
+
+ if (mask == TRACE_ITER_COPY_MARKER)
+ update_marker_trace(tr, enabled);
+
if (enabled)
tr->trace_flags |= mask;
else
@@ -5395,6 +5467,10 @@ static const struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
+ "By default tracefs removes all OTH file permission bits.\n"
+ "When mounting tracefs an optional group id can be specified\n"
+ "which adds the group to every directory and file in tracefs:\n\n"
+ "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n"
"# echo 0 > tracing_on : quick way to disable tracing\n"
"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
" Important files:\n"
@@ -5540,7 +5616,7 @@ static const char readme_msg[] =
"\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
- "\t symstr, <type>\\[<array-size>\\]\n"
+ "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
@@ -5549,6 +5625,8 @@ static const char readme_msg[] =
"\t efield: For event probes ('e' types), the field is on of the fields\n"
"\t of the <attached-group>/<attached-event>.\n"
#endif
+ " set_event\t\t- Enables events by name written into it\n"
+ "\t\t\t Can enable module events via: :mod:<module>\n"
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
" events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -5821,7 +5899,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
return;
}
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
if (!trace_eval_maps)
trace_eval_maps = map_array;
@@ -5845,8 +5923,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
map_array++;
}
memset(map_array, 0, sizeof(*map_array));
-
- mutex_unlock(&trace_eval_mutex);
}
static void trace_create_eval_file(struct dentry *d_tracer)
@@ -6008,28 +6084,170 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (cpu_id != RING_BUFFER_ALL_CPUS) {
/* make sure, this cpu is enabled in the mask */
- if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask))
+ return -EINVAL;
}
- ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
- if (ret < 0)
- ret = -ENOMEM;
+ return __tracing_resize_ring_buffer(tr, size, cpu_id);
+}
-out:
- mutex_unlock(&trace_types_lock);
+struct trace_mod_entry {
+ unsigned long mod_addr;
+ char mod_name[MODULE_NAME_LEN];
+};
- return ret;
+struct trace_scratch {
+ unsigned int clock_id;
+ unsigned long text_addr;
+ unsigned long nr_entries;
+ struct trace_mod_entry entries[];
+};
+
+static DEFINE_MUTEX(scratch_mutex);
+
+static int cmp_mod_entry(const void *key, const void *pivot)
+{
+ unsigned long addr = (unsigned long)key;
+ const struct trace_mod_entry *ent = pivot;
+
+ if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr)
+ return 0;
+ else
+ return addr - ent->mod_addr;
+}
+
+/**
+ * trace_adjust_address() - Adjust prev boot address to current address.
+ * @tr: Persistent ring buffer's trace_array.
+ * @addr: Address in @tr which is adjusted.
+ */
+unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ unsigned long raddr;
+ int idx = 0, nr_entries;
+
+ /* If we don't have last boot delta, return the address */
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return addr;
+
+ /* tr->module_delta must be protected by rcu. */
+ guard(rcu)();
+ tscratch = tr->scratch;
+ /* if there is no tscrach, module_delta must be NULL. */
+ module_delta = READ_ONCE(tr->module_delta);
+ if (!module_delta || !tscratch->nr_entries ||
+ tscratch->entries[0].mod_addr > addr) {
+ raddr = addr + tr->text_delta;
+ return __is_kernel(raddr) || is_kernel_core_data(raddr) ||
+ is_kernel_rodata(raddr) ? raddr : addr;
+ }
+
+ /* Note that entries must be sorted. */
+ nr_entries = tscratch->nr_entries;
+ if (nr_entries == 1 ||
+ tscratch->entries[nr_entries - 1].mod_addr < addr)
+ idx = nr_entries - 1;
+ else {
+ entry = __inline_bsearch((void *)addr,
+ tscratch->entries,
+ nr_entries - 1,
+ sizeof(tscratch->entries[0]),
+ cmp_mod_entry);
+ if (entry)
+ idx = entry - tscratch->entries;
+ }
+
+ return addr + module_delta->delta[idx];
+}
+
+#ifdef CONFIG_MODULES
+static int save_mod(struct module *mod, void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ unsigned int size;
+
+ tscratch = tr->scratch;
+ if (!tscratch)
+ return -1;
+ size = tr->scratch_size;
+
+ if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size)
+ return -1;
+
+ entry = &tscratch->entries[tscratch->nr_entries];
+
+ tscratch->nr_entries++;
+
+ entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base;
+ strscpy(entry->mod_name, mod->name);
+
+ return 0;
}
+#else
+static int save_mod(struct module *mod, void *data)
+{
+ return 0;
+}
+#endif
+
+static void update_last_data(struct trace_array *tr)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+
+ if (!(tr->flags & TRACE_ARRAY_FL_BOOT))
+ return;
+
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return;
+
+ /* Only if the buffer has previous boot data clear and update it. */
+ tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
+
+ /* Reset the module list and reload them */
+ if (tr->scratch) {
+ struct trace_scratch *tscratch = tr->scratch;
+
+ tscratch->clock_id = tr->clock_id;
+ memset(tscratch->entries, 0,
+ flex_array_size(tscratch, entries, tscratch->nr_entries));
+ tscratch->nr_entries = 0;
+
+ guard(mutex)(&scratch_mutex);
+ module_for_each_mod(save_mod, tr);
+ }
+
+ /*
+ * Need to clear all CPU buffers as there cannot be events
+ * from the previous boot mixed with events with this boot
+ * as that will cause a confusing trace. Need to clear all
+ * CPU buffers, even for those that may currently be offline.
+ */
+ tracing_reset_all_cpus(&tr->array_buffer);
+ /* Using current data now */
+ tr->text_delta = 0;
+
+ if (!tr->scratch)
+ return;
+
+ tscratch = tr->scratch;
+ module_delta = READ_ONCE(tr->module_delta);
+ WRITE_ONCE(tr->module_delta, NULL);
+ kfree_rcu(module_delta, rcu);
+
+ /* Set the persistent ring buffer meta data to this address */
+ tscratch->text_addr = (unsigned long)_text;
+}
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6047,6 +6265,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
mutex_lock(&trace_types_lock);
+
+ update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6068,7 +6289,7 @@ static void tracing_set_nop(struct trace_array *tr)
{
if (tr->current_trace == &nop_trace)
return;
-
+
tr->current_trace->enabled--;
if (tr->current_trace->reset)
@@ -6098,15 +6319,17 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
#endif
- int ret = 0;
+ int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
+
+ update_last_data(tr);
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
}
@@ -6114,43 +6337,37 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (strcmp(t->name, buf) == 0)
break;
}
- if (!t) {
- ret = -EINVAL;
- goto out;
- }
+ if (!t)
+ return -EINVAL;
+
if (t == tr->current_trace)
- goto out;
+ return 0;
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
+ ret = tr->cond_snapshot ? -EBUSY : 0;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
}
#endif
/* Some tracers won't work on kernel command line */
if (system_state < SYSTEM_RUNNING && t->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
t->name);
- goto out;
+ return -EINVAL;
}
/* Some tracers are only allowed for the top level buffer */
- if (!trace_ok_for_array(t, tr)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!trace_ok_for_array(t, tr))
+ return -EINVAL;
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->trace_ref) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->trace_ref)
+ return -EBUSY;
trace_branch_disable();
@@ -6181,7 +6398,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (!had_max_tr && t->use_max_tr) {
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- goto out;
+ return ret;
}
#else
tr->current_trace = &nop_trace;
@@ -6194,17 +6411,15 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t->use_max_tr)
tracing_disarm_snapshot(tr);
#endif
- goto out;
+ return ret;
}
}
tr->current_trace = t;
tr->current_trace->enabled++;
trace_branch_enable(tr);
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
static ssize_t
@@ -6282,22 +6497,18 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
if (ret < 0)
- goto out;
+ return ret;
if (tr->current_trace->update_thresh) {
ret = tr->current_trace->update_thresh(tr);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = cnt;
-out:
- mutex_unlock(&trace_types_lock);
-
- return ret;
+ return cnt;
}
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6501,6 +6712,22 @@ static int tracing_wait_pipe(struct file *filp)
return 1;
}
+static bool update_last_data_if_empty(struct trace_array *tr)
+{
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return false;
+
+ if (!ring_buffer_empty(tr->array_buffer.buffer))
+ return false;
+
+ /*
+ * If the buffer contains the last boot data and all per-cpu
+ * buffers are empty, reset it from the kernel side.
+ */
+ update_last_data(tr);
+ return true;
+}
+
/*
* Consumer reader.
*/
@@ -6516,31 +6743,32 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
- mutex_lock(&iter->mutex);
+ guard(mutex)(&iter->mutex);
/* return any leftover data */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (sret != -EBUSY)
- goto out;
+ return sret;
trace_seq_init(&iter->seq);
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
- goto out;
+ return sret;
}
waitagain:
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
sret = tracing_wait_pipe(filp);
if (sret <= 0)
- goto out;
+ return sret;
/* stop when tracing is finished */
- if (trace_empty(iter)) {
- sret = 0;
- goto out;
- }
+ if (trace_empty(iter))
+ return 0;
if (cnt >= TRACE_SEQ_BUFFER_SIZE)
cnt = TRACE_SEQ_BUFFER_SIZE - 1;
@@ -6604,9 +6832,6 @@ waitagain:
if (sret == -EBUSY)
goto waitagain;
-out:
- mutex_unlock(&iter->mutex);
-
return sret;
}
@@ -6719,13 +6944,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- trace_seq_used(&iter->seq));
+ min((size_t)trace_seq_used(&iter->seq),
+ (size_t)PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = trace_seq_used(&iter->seq);
+ spd.partial[i].len = ret;
trace_seq_init(&iter->seq);
}
@@ -6849,6 +7075,120 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
+#define LAST_BOOT_HEADER ((void *)1)
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+ struct trace_scratch *tscratch = tr->scratch;
+ unsigned int index = *pos;
+
+ (*pos)++;
+
+ if (*pos == 1)
+ return LAST_BOOT_HEADER;
+
+ /* Only show offsets of the last boot data */
+ if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return NULL;
+
+ /* *pos 0 is for the header, 1 is for the first module */
+ index--;
+
+ if (index >= tscratch->nr_entries)
+ return NULL;
+
+ return &tscratch->entries[index];
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&scratch_mutex);
+
+ return l_next(m, NULL, pos);
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&scratch_mutex);
+}
+
+static void show_last_boot_header(struct seq_file *m, struct trace_array *tr)
+{
+ struct trace_scratch *tscratch = tr->scratch;
+
+ /*
+ * Do not leak KASLR address. This only shows the KASLR address of
+ * the last boot. When the ring buffer is started, the LAST_BOOT
+ * flag gets cleared, and this should only report "current".
+ * Otherwise it shows the KASLR address from the previous boot which
+ * should not be the same as the current boot.
+ */
+ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr);
+ else
+ seq_puts(m, "# Current\n");
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct trace_array *tr = m->private;
+ struct trace_mod_entry *entry = v;
+
+ if (v == LAST_BOOT_HEADER) {
+ show_last_boot_header(m, tr);
+ return 0;
+ }
+
+ seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name);
+ return 0;
+}
+
+static const struct seq_operations last_boot_seq_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int tracing_last_boot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = seq_open(file, &last_boot_seq_ops);
+ if (ret) {
+ trace_array_put(tr);
+ return ret;
+ }
+
+ m = file->private_data;
+ m->private = tr;
+
+ return 0;
+}
+
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+ if (ret < 0)
+ __trace_array_put(tr);
+ return ret;
+}
+
static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -6881,11 +7221,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
#define TRACE_MARKER_MAX_SIZE 4096
-static ssize_t
-tracing_mark_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *fpos)
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf,
+ size_t cnt, unsigned long ip)
{
- struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
@@ -6899,18 +7237,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
#define FAULTED_STR "<faulted>"
#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
- if (tracing_disabled)
- return -EINVAL;
-
- if (!(tr->trace_flags & TRACE_ITER_MARKERS))
- return -EINVAL;
-
- if ((ssize_t)cnt < 0)
- return -EINVAL;
-
- if (cnt > TRACE_MARKER_MAX_SIZE)
- cnt = TRACE_MARKER_MAX_SIZE;
-
meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
again:
size = cnt + meta_size;
@@ -6943,7 +7269,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
}
entry = ring_buffer_event_data(event);
- entry->ip = _THIS_IP_;
+ entry->ip = ip;
len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
if (len) {
@@ -6976,18 +7302,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
-tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct raw_data_entry *entry;
- ssize_t written;
- int size;
- int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+ ssize_t written = -ENODEV;
+ unsigned long ip;
if (tracing_disabled)
return -EINVAL;
@@ -6995,10 +7315,42 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- /* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int))
+ if ((ssize_t)cnt < 0)
return -EINVAL;
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
+
+ /* The selftests expect this function to be the IP address */
+ ip = _THIS_IP_;
+
+ /* The global trace_marker can go to multiple instances */
+ if (tr == &global_trace) {
+ guard(rcu)();
+ list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
+ written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+ if (written < 0)
+ break;
+ }
+ } else {
+ written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+ }
+
+ return written;
+}
+
+static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
+ const char __user *ubuf, size_t cnt)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct raw_data_entry *entry;
+ ssize_t written;
+ int size;
+ int len;
+
+#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
@@ -7029,6 +7381,40 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return written;
}
+static ssize_t
+tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct trace_array *tr = filp->private_data;
+ ssize_t written = -ENODEV;
+
+#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+
+ if (tracing_disabled)
+ return -EINVAL;
+
+ if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
+ /* The marker must at least have a tag id */
+ if (cnt < sizeof(unsigned int))
+ return -EINVAL;
+
+ /* The global trace_marker_raw can go to multiple instances */
+ if (tr == &global_trace) {
+ guard(rcu)();
+ list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
+ written = write_raw_marker_to_buffer(tr, ubuf, cnt);
+ if (written < 0)
+ break;
+ }
+ } else {
+ written = write_raw_marker_to_buffer(tr, ubuf, cnt);
+ }
+
+ return written;
+}
+
static int tracing_clock_show(struct seq_file *m, void *v)
{
struct trace_array *tr = m->private;
@@ -7073,6 +7459,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->max_buffer);
#endif
+ if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) {
+ struct trace_scratch *tscratch = tr->scratch;
+
+ tscratch->clock_id = i;
+ }
+
mutex_unlock(&trace_types_lock);
return 0;
@@ -7167,25 +7559,19 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
*/
int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
- int ret = 0;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (set && tr->no_filter_buffering_ref++)
- goto out;
+ return 0;
if (!set) {
- if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref))
+ return -EINVAL;
--tr->no_filter_buffering_ref;
}
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
struct ftrace_buffer_info {
@@ -7261,12 +7647,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->current_trace->use_max_tr)
+ return -EBUSY;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
@@ -7275,24 +7659,20 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
switch (val) {
case 0:
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
if (tr->allocated_snapshot)
free_snapshot(tr);
break;
case 1:
/* Only allow per-cpu swap if the ring buffer supports it */
#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
#endif
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
@@ -7300,7 +7680,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- break;
+ return ret;
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
@@ -7327,8 +7707,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
ret = cnt;
}
-out:
- mutex_unlock(&trace_types_lock);
+
return ret;
}
@@ -7414,7 +7793,6 @@ static const struct file_operations tracing_pipe_fops = {
.read = tracing_read_pipe,
.splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
- .llseek = no_llseek,
};
static const struct file_operations tracing_entries_fops = {
@@ -7425,6 +7803,13 @@ static const struct file_operations tracing_entries_fops = {
.release = tracing_release_generic_tr,
};
+static const struct file_operations tracing_buffer_meta_fops = {
+ .open = tracing_buffer_meta_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
+};
+
static const struct file_operations tracing_total_entries_fops = {
.open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
@@ -7465,6 +7850,13 @@ static const struct file_operations trace_time_stamp_mode_fops = {
.release = tracing_single_release_tr,
};
+static const struct file_operations last_boot_fops = {
+ .open = tracing_last_boot_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static const struct file_operations snapshot_fops = {
.open = tracing_snapshot_open,
@@ -7479,7 +7871,6 @@ static const struct file_operations snapshot_raw_fops = {
.read = tracing_buffers_read,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
- .llseek = no_llseek,
};
#endif /* CONFIG_TRACER_SNAPSHOT */
@@ -7702,12 +8093,11 @@ void tracing_log_err(struct trace_array *tr,
len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
- mutex_lock(&tracing_err_log_lock);
+ guard(mutex)(&tracing_err_log_lock);
+
err = get_tracing_log_err(tr, len);
- if (PTR_ERR(err) == -ENOMEM) {
- mutex_unlock(&tracing_err_log_lock);
+ if (PTR_ERR(err) == -ENOMEM)
return;
- }
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
@@ -7718,7 +8108,6 @@ void tracing_log_err(struct trace_array *tr,
err->info.ts = local_clock();
list_add_tail(&err->list, &tr->err_log);
- mutex_unlock(&tracing_err_log_lock);
}
static void clear_tracing_err_log(struct trace_array *tr)
@@ -7950,7 +8339,10 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
trace_access_unlock(iter->cpu_file);
if (ret < 0) {
- if (trace_empty(iter)) {
+ if (trace_empty(iter) && !iter->closed) {
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
@@ -8194,15 +8586,32 @@ out:
return ret;
}
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
+ int err;
- if (cmd)
- return -ENOIOCTLCMD;
+ if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
+ if (!(file->f_flags & O_NONBLOCK)) {
+ err = ring_buffer_wait(iter->array_buffer->buffer,
+ iter->cpu_file,
+ iter->tr->buffer_percent,
+ NULL, NULL);
+ if (err)
+ return err;
+ }
+
+ return ring_buffer_map_get_reader(iter->array_buffer->buffer,
+ iter->cpu_file);
+ } else if (cmd) {
+ return -ENOTTY;
+ }
+ /*
+ * An ioctl call with cmd 0 to the ring buffer file will wake up all
+ * waiters
+ */
mutex_lock(&trace_types_lock);
/* Make sure the waiters see the new wait_index */
@@ -8214,6 +8623,80 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+static int get_snapshot_map(struct trace_array *tr)
+{
+ int err = 0;
+
+ /*
+ * Called with mmap_lock held. lockdep would be unhappy if we would now
+ * take trace_types_lock. Instead use the specific
+ * snapshot_trigger_lock.
+ */
+ spin_lock(&tr->snapshot_trigger_lock);
+
+ if (tr->snapshot || tr->mapped == UINT_MAX)
+ err = -EBUSY;
+ else
+ tr->mapped++;
+
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ /* Wait for update_max_tr() to observe iter->tr->mapped */
+ if (tr->mapped == 1)
+ synchronize_rcu();
+
+ return err;
+
+}
+static void put_snapshot_map(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->mapped))
+ tr->mapped--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+#else
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+#endif
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file));
+ put_snapshot_map(iter->tr);
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+ .close = tracing_buffers_mmap_close,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+ int ret = 0;
+
+ /* A memmap'ed buffer is not supported for user space mmap */
+ if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
+ return -ENODEV;
+
+ ret = get_snapshot_map(iter->tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
+ if (ret)
+ put_snapshot_map(iter->tr);
+
+ vma->vm_ops = &tracing_buffers_vmops;
+
+ return ret;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
@@ -8222,7 +8705,7 @@ static const struct file_operations tracing_buffers_fops = {
.flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
- .llseek = no_llseek,
+ .mmap = tracing_buffers_mmap,
};
static ssize_t
@@ -8306,15 +8789,22 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
char *buf;
int r;
- /* 256 should be plenty to hold the amount needed */
- buf = kmalloc(256, GFP_KERNEL);
+ /* 512 should be plenty to hold the amount needed */
+#define DYN_INFO_BUF_SIZE 512
+
+ buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n",
+ r = scnprintf(buf, DYN_INFO_BUF_SIZE,
+ "%ld pages:%ld groups: %ld\n"
+ "ftrace boot update time = %llu (ns)\n"
+ "ftrace module total update time = %llu (ns)\n",
ftrace_update_tot_cnt,
ftrace_number_of_pages,
- ftrace_number_of_groups);
+ ftrace_number_of_groups,
+ ftrace_update_time,
+ ftrace_total_mod_time);
ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
kfree(buf);
@@ -8569,12 +9059,17 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu,
tr, cpu, &tracing_entries_fops);
+ if (tr->range_addr_start)
+ trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &tracing_buffer_meta_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
- tr, cpu, &snapshot_fops);
+ if (!tr->range_addr_start) {
+ trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, &snapshot_fops);
- trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
- tr, cpu, &snapshot_raw_fops);
+ trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+ }
#endif
}
@@ -9102,16 +9597,142 @@ static struct dentry *trace_instance_dir;
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+#ifdef CONFIG_MODULES
+static int make_mod_delta(struct module *mod, void *data)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ struct trace_array *tr = data;
+ int i;
+
+ tscratch = tr->scratch;
+ module_delta = READ_ONCE(tr->module_delta);
+ for (i = 0; i < tscratch->nr_entries; i++) {
+ entry = &tscratch->entries[i];
+ if (strcmp(mod->name, entry->mod_name))
+ continue;
+ if (mod->state == MODULE_STATE_GOING)
+ module_delta->delta[i] = 0;
+ else
+ module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base
+ - entry->mod_addr;
+ break;
+ }
+ return 0;
+}
+#else
+static int make_mod_delta(struct module *mod, void *data)
+{
+ return 0;
+}
+#endif
+
+static int mod_addr_comp(const void *a, const void *b, const void *data)
+{
+ const struct trace_mod_entry *e1 = a;
+ const struct trace_mod_entry *e2 = b;
+
+ return e1->mod_addr > e2->mod_addr ? 1 : -1;
+}
+
+static void setup_trace_scratch(struct trace_array *tr,
+ struct trace_scratch *tscratch, unsigned int size)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_mod_entry *entry;
+ int i, nr_entries;
+
+ if (!tscratch)
+ return;
+
+ tr->scratch = tscratch;
+ tr->scratch_size = size;
+
+ if (tscratch->text_addr)
+ tr->text_delta = (unsigned long)_text - tscratch->text_addr;
+
+ if (struct_size(tscratch, entries, tscratch->nr_entries) > size)
+ goto reset;
+
+ /* Check if each module name is a valid string */
+ for (i = 0; i < tscratch->nr_entries; i++) {
+ int n;
+
+ entry = &tscratch->entries[i];
+
+ for (n = 0; n < MODULE_NAME_LEN; n++) {
+ if (entry->mod_name[n] == '\0')
+ break;
+ if (!isprint(entry->mod_name[n]))
+ goto reset;
+ }
+ if (n == MODULE_NAME_LEN)
+ goto reset;
+ }
+
+ /* Sort the entries so that we can find appropriate module from address. */
+ nr_entries = tscratch->nr_entries;
+ sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry),
+ mod_addr_comp, NULL, NULL);
+
+ if (IS_ENABLED(CONFIG_MODULES)) {
+ module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL);
+ if (!module_delta) {
+ pr_info("module_delta allocation failed. Not able to decode module address.");
+ goto reset;
+ }
+ init_rcu_head(&module_delta->rcu);
+ } else
+ module_delta = NULL;
+ WRITE_ONCE(tr->module_delta, module_delta);
+
+ /* Scan modules to make text delta for modules. */
+ module_for_each_mod(make_mod_delta, tr);
+
+ /* Set trace_clock as the same of the previous boot. */
+ if (tscratch->clock_id != tr->clock_id) {
+ if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) ||
+ tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) {
+ pr_info("the previous trace_clock info is not valid.");
+ goto reset;
+ }
+ }
+ return;
+ reset:
+ /* Invalid trace modules */
+ memset(tscratch, 0, size);
+}
+
static int
allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
+ struct trace_scratch *tscratch;
+ unsigned int scratch_size = 0;
rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
buf->tr = tr;
- buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (tr->range_addr_start && tr->range_addr_size) {
+ /* Add scratch buffer to handle 128 modules */
+ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size,
+ struct_size(tscratch, entries, 128));
+
+ tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size);
+ setup_trace_scratch(tr, tscratch, scratch_size);
+
+ /*
+ * This is basically the same as a mapped buffer,
+ * with the same restrictions.
+ */
+ tr->mapped++;
+ } else {
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ }
if (!buf->buffer)
return -ENOMEM;
@@ -9148,6 +9769,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return ret;
#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Fix mapped buffer trace arrays do not have snapshot buffers */
+ if (tr->range_addr_start)
+ return 0;
+
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9168,6 +9793,7 @@ static void free_trace_buffers(struct trace_array *tr)
return;
free_trace_buffer(&tr->array_buffer);
+ kfree(tr->module_delta);
#ifdef CONFIG_TRACER_MAX_TRACE
free_trace_buffer(&tr->max_buffer);
@@ -9248,7 +9874,9 @@ static int trace_array_create_dir(struct trace_array *tr)
}
static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+ unsigned long range_addr_start,
+ unsigned long range_addr_size)
{
struct trace_array *tr;
int ret;
@@ -9274,6 +9902,10 @@ trace_array_create_systems(const char *name, const char *systems)
goto out_free_tr;
}
+ /* Only for boot up memory mapped ring buffers */
+ tr->range_addr_start = range_addr_start;
+ tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9290,6 +9922,11 @@ trace_array_create_systems(const char *name, const char *systems)
INIT_LIST_HEAD(&tr->events);
INIT_LIST_HEAD(&tr->hist_vars);
INIT_LIST_HEAD(&tr->err_log);
+ INIT_LIST_HEAD(&tr->marker_list);
+
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&tr->mod_events);
+#endif
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -9323,6 +9960,7 @@ trace_array_create_systems(const char *name, const char *systems)
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree_const(tr->system_names);
+ kfree(tr->range_name);
kfree(tr->name);
kfree(tr);
@@ -9331,7 +9969,7 @@ trace_array_create_systems(const char *name, const char *systems)
static struct trace_array *trace_array_create(const char *name)
{
- return trace_array_create_systems(name, NULL);
+ return trace_array_create_systems(name, NULL, 0, 0);
}
static int instance_mkdir(const char *name)
@@ -9339,23 +9977,50 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
ret = -EEXIST;
if (trace_array_find(name))
- goto out_unlock;
+ return -EEXIST;
tr = trace_array_create(name);
ret = PTR_ERR_OR_ZERO(tr);
-out_unlock:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return ret;
}
+#ifdef CONFIG_MMU
+static u64 map_pages(unsigned long start, unsigned long size)
+{
+ unsigned long vmap_start, vmap_end;
+ struct vm_struct *area;
+ int ret;
+
+ area = get_vm_area(size, VM_IOREMAP);
+ if (!area)
+ return 0;
+
+ vmap_start = (unsigned long) area->addr;
+ vmap_end = vmap_start + size;
+
+ ret = vmap_page_range(vmap_start, vmap_end,
+ start, pgprot_nx(PAGE_KERNEL));
+ if (ret < 0) {
+ free_vm_area(area);
+ return 0;
+ }
+
+ return (u64)vmap_start;
+}
+#else
+static inline u64 map_pages(unsigned long start, unsigned long size)
+{
+ return 0;
+}
+#endif
+
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
@@ -9377,24 +10042,23 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
{
struct trace_array *tr;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ tr->ref++;
+ return tr;
+ }
}
- tr = trace_array_create_systems(name, systems);
+ tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
-out_unlock:
- if (tr)
+ else
tr->ref++;
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return tr;
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
@@ -9415,6 +10079,12 @@ static int __remove_instance(struct trace_array *tr)
set_tracer_flag(tr, 1 << i, 0);
}
+ if (printk_trace == tr)
+ update_printk_trace(&global_trace);
+
+ if (update_marker_trace(tr, 0))
+ synchronize_rcu();
+
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -9425,6 +10095,11 @@ static int __remove_instance(struct trace_array *tr)
free_trace_buffers(tr);
clear_tracing_err_log(tr);
+ if (tr->range_name) {
+ reserve_mem_release_by_name(tr->range_name);
+ kfree(tr->range_name);
+ }
+
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
}
@@ -9442,48 +10117,36 @@ static int __remove_instance(struct trace_array *tr)
int trace_array_destroy(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret;
if (!this_tr)
return -EINVAL;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
/* Making sure trace array exists before destroying it. */
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == this_tr) {
- ret = __remove_instance(tr);
- break;
- }
+ if (tr == this_tr)
+ return __remove_instance(tr);
}
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
-
- return ret;
+ return -ENODEV;
}
EXPORT_SYMBOL_GPL(trace_array_destroy);
static int instance_rmdir(const char *name)
{
struct trace_array *tr;
- int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
tr = trace_array_find(name);
- if (tr)
- ret = __remove_instance(tr);
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ if (!tr)
+ return -ENODEV;
- return ret;
+ return __remove_instance(tr);
}
static __init void create_trace_instances(struct dentry *d_tracer)
@@ -9496,19 +10159,16 @@ static __init void create_trace_instances(struct dentry *d_tracer)
if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->name)
continue;
if (MEM_FAIL(trace_array_create_dir(tr) < 0,
"Failed to create instance directory\n"))
- break;
+ return;
}
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
}
static void
@@ -9577,10 +10237,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
+ if (tr->range_addr_start) {
+ trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+ tr, &last_boot_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, &snapshot_fops);
+ } else {
+ trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
+ tr, &snapshot_fops);
#endif
+ }
trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
tr, &tracing_err_log_fops);
@@ -9595,6 +10260,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
+ struct fs_context *fc;
+ int ret;
/*
* To maintain backward compatibility for tools that mount
@@ -9604,12 +10271,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_submount(mntpt, type, "tracefs", NULL);
+
+ fc = fs_context_for_submount(type, mntpt);
put_filesystem(type);
- if (IS_ERR(mnt))
- return NULL;
- mntget(mnt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = vfs_parse_fs_string(fc, "source",
+ "tracefs", strlen("tracefs"));
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+ put_fs_context(fc);
return mnt;
}
@@ -9693,6 +10368,24 @@ late_initcall_sync(trace_eval_sync);
#ifdef CONFIG_MODULES
+
+bool module_exists(const char *module)
+{
+ /* All modules have the symbol __this_module */
+ static const char this_mod[] = "__this_module";
+ char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
+ unsigned long val;
+ int n;
+
+ n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
+
+ if (n > sizeof(modname) - 1)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
static void trace_module_add_evals(struct module *mod)
{
if (!mod->num_trace_evals)
@@ -9717,7 +10410,7 @@ static void trace_module_remove_evals(struct module *mod)
if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
map = trace_eval_maps;
@@ -9729,17 +10422,33 @@ static void trace_module_remove_evals(struct module *mod)
map = map->tail.next;
}
if (!map)
- goto out;
+ return;
*last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
- out:
- mutex_unlock(&trace_eval_mutex);
}
#else
static inline void trace_module_remove_evals(struct module *mod) { }
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
+static void trace_module_record(struct module *mod, bool add)
+{
+ struct trace_array *tr;
+ unsigned long flags;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT);
+ /* Update any persistent trace array that has already been started */
+ if (flags == TRACE_ARRAY_FL_BOOT && add) {
+ guard(mutex)(&scratch_mutex);
+ save_mod(mod, tr);
+ } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) {
+ /* Update delta if the module loaded in previous boot */
+ make_mod_delta(mod, tr);
+ }
+ }
+}
+
static int trace_module_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -9748,9 +10457,11 @@ static int trace_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_evals(mod);
+ trace_module_record(mod, true);
break;
case MODULE_STATE_GOING:
trace_module_remove_evals(mod);
+ trace_module_record(mod, false);
break;
}
@@ -9932,7 +10643,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
static struct trace_iterator iter;
unsigned int old_userobj;
unsigned long flags;
- int cnt = 0, cpu;
+ int cnt = 0;
/*
* Always turn off tracing when we dump.
@@ -9949,9 +10660,8 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
/* Simulate the iterator */
trace_init_iter(&iter, tr);
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ /* While dumping, do not allow the buffer to be enable */
+ tracer_tracing_disable(tr);
old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -10010,9 +10720,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
tr->trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_enable(tr);
local_irq_restore(flags);
}
@@ -10199,7 +10907,9 @@ static inline void do_allocate_snapshot(const char *name) { }
__init static void enable_instances(void)
{
struct trace_array *tr;
+ bool memmap_area = false;
char *curr_str;
+ char *name;
char *str;
char *tok;
@@ -10208,19 +10918,126 @@ __init static void enable_instances(void)
str = boot_instance_info;
while ((curr_str = strsep(&str, "\t"))) {
+ phys_addr_t start = 0;
+ phys_addr_t size = 0;
+ unsigned long addr = 0;
+ bool traceprintk = false;
+ bool traceoff = false;
+ char *flag_delim;
+ char *addr_delim;
+ char *rname __free(kfree) = NULL;
tok = strsep(&curr_str, ",");
- if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
- do_allocate_snapshot(tok);
+ flag_delim = strchr(tok, '^');
+ addr_delim = strchr(tok, '@');
- tr = trace_array_get_by_name(tok, NULL);
- if (!tr) {
- pr_warn("Failed to create instance buffer %s\n", curr_str);
+ if (addr_delim)
+ *addr_delim++ = '\0';
+
+ if (flag_delim)
+ *flag_delim++ = '\0';
+
+ name = tok;
+
+ if (flag_delim) {
+ char *flag;
+
+ while ((flag = strsep(&flag_delim, "^"))) {
+ if (strcmp(flag, "traceoff") == 0) {
+ traceoff = true;
+ } else if ((strcmp(flag, "printk") == 0) ||
+ (strcmp(flag, "traceprintk") == 0) ||
+ (strcmp(flag, "trace_printk") == 0)) {
+ traceprintk = true;
+ } else {
+ pr_info("Tracing: Invalid instance flag '%s' for %s\n",
+ flag, name);
+ }
+ }
+ }
+
+ tok = addr_delim;
+ if (tok && isdigit(*tok)) {
+ start = memparse(tok, &tok);
+ if (!start) {
+ pr_warn("Tracing: Invalid boot instance address for %s\n",
+ name);
+ continue;
+ }
+ if (*tok != ':') {
+ pr_warn("Tracing: No size specified for instance %s\n", name);
+ continue;
+ }
+ tok++;
+ size = memparse(tok, &tok);
+ if (!size) {
+ pr_warn("Tracing: Invalid boot instance size for %s\n",
+ name);
+ continue;
+ }
+ memmap_area = true;
+ } else if (tok) {
+ if (!reserve_mem_find_by_name(tok, &start, &size)) {
+ start = 0;
+ pr_warn("Failed to map boot instance %s to %s\n", name, tok);
+ continue;
+ }
+ rname = kstrdup(tok, GFP_KERNEL);
+ }
+
+ if (start) {
+ /* Start and size must be page aligned */
+ if (start & ~PAGE_MASK) {
+ pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start);
+ continue;
+ }
+ if (size & ~PAGE_MASK) {
+ pr_warn("Tracing: mapping size %pa is not page aligned\n", &size);
+ continue;
+ }
+
+ if (memmap_area)
+ addr = map_pages(start, size);
+ else
+ addr = (unsigned long)phys_to_virt(start);
+ if (addr) {
+ pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n",
+ name, &start, (unsigned long)size);
+ } else {
+ pr_warn("Tracing: Failed to map boot instance %s\n", name);
+ continue;
+ }
+ } else {
+ /* Only non mapped buffers have snapshot buffers */
+ if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
+ do_allocate_snapshot(name);
+ }
+
+ tr = trace_array_create_systems(name, NULL, addr, size);
+ if (IS_ERR(tr)) {
+ pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str);
continue;
}
- /* Allow user space to delete it */
- trace_array_put(tr);
+
+ if (traceoff)
+ tracer_tracing_off(tr);
+
+ if (traceprintk)
+ update_printk_trace(tr);
+
+ /*
+ * memmap'd buffers can not be freed.
+ */
+ if (memmap_area) {
+ tr->flags |= TRACE_ARRAY_FL_MEMMAP;
+ tr->ref++;
+ }
+
+ if (start) {
+ tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
+ tr->range_name = no_free_ptr(rname);
+ }
while ((tok = strsep(&curr_str, ","))) {
early_enable_events(tr, tok, true);
@@ -10318,6 +11135,10 @@ __init static int tracer_alloc_buffers(void)
#endif
ftrace_init_global_array_ops(&global_trace);
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&global_trace.mod_events);
+#endif
+
init_trace_flags_index(&global_trace);
register_tracer(&nop_trace);
@@ -10339,14 +11160,13 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.events);
INIT_LIST_HEAD(&global_trace.hist_vars);
INIT_LIST_HEAD(&global_trace.err_log);
+ list_add(&global_trace.marker_list, &marker_copies);
list_add(&global_trace.list, &ftrace_trace_arrays);
apply_trace_boot_options();
register_snapshot_cmd();
- test_can_verify();
-
return 0;
out_free_pipe_cpumask:
@@ -10365,6 +11185,14 @@ out:
return ret;
}
+#ifdef CONFIG_FUNCTION_TRACER
+/* Used to set module cached ftrace filtering at boot up */
+__init struct trace_array *trace_get_global_array(void)
+{
+ return &global_trace;
+}
+#endif
+
void __init ftrace_boot_snapshot(void)
{
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -10453,6 +11281,9 @@ __init static int late_trace_init(void)
tracepoint_printk = 0;
}
+ if (traceoff_after_boot)
+ tracing_off();
+
tracing_set_default_clock();
clear_boot_tracer();
return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 64450615ca0c..bd084953a98b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -21,6 +21,7 @@
#include <linux/workqueue.h>
#include <linux/ctype.h>
#include <linux/once_lite.h>
+#include <linux/ftrace_regs.h>
#include "pid_list.h"
@@ -46,6 +47,7 @@ enum trace_type {
TRACE_BRANCH,
TRACE_GRAPH_RET,
TRACE_GRAPH_ENT,
+ TRACE_GRAPH_RETADDR_ENT,
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
@@ -181,8 +183,7 @@ struct trace_array;
* the trace, etc.)
*/
struct trace_array_cpu {
- atomic_t disabled;
- void *buffer_page; /* ring buffer spare */
+ local_t disabled;
unsigned long entries;
unsigned long saved_latency;
@@ -311,6 +312,11 @@ struct trace_func_repeats {
u64 ts_last_call;
};
+struct trace_module_delta {
+ struct rcu_head rcu;
+ long delta[];
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -343,6 +349,18 @@ struct trace_array {
struct irq_work fsnotify_irqwork;
#endif
#endif
+ /* The below is for memory mapped ring buffer */
+ unsigned int mapped;
+ unsigned long range_addr_start;
+ unsigned long range_addr_size;
+ char *range_name;
+ long text_delta;
+ struct trace_module_delta *module_delta;
+ void *scratch; /* pointer in persistent memory */
+ int scratch_size;
+
+ int buffer_disabled;
+
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
/*
@@ -359,7 +377,6 @@ struct trace_array {
* CONFIG_TRACER_MAX_TRACE.
*/
arch_spinlock_t max_lock;
- int buffer_disabled;
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
@@ -386,16 +403,23 @@ struct trace_array {
struct trace_options *topts;
struct list_head systems;
struct list_head events;
+ struct list_head marker_list;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
/* one per_cpu trace_pipe can be opened by only one user */
cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
+#ifdef CONFIG_MODULES
+ struct list_head mod_events;
+#endif
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
struct trace_pid_list __rcu *function_no_pids;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ struct fgraph_ops *gops;
+#endif
#ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_head func_probes;
@@ -419,9 +443,22 @@ struct trace_array {
};
enum {
- TRACE_ARRAY_FL_GLOBAL = (1 << 0)
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
+ TRACE_ARRAY_FL_LAST_BOOT = BIT(2),
+ TRACE_ARRAY_FL_MOD_INIT = BIT(3),
+ TRACE_ARRAY_FL_MEMMAP = BIT(4),
};
+#ifdef CONFIG_MODULES
+bool module_exists(const char *module);
+#else
+static inline bool module_exists(const char *module)
+{
+ return false;
+}
+#endif
+
extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
@@ -437,6 +474,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
+extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr);
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -501,6 +540,8 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
TRACE_GRAPH_ENT); \
+ IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\
+ TRACE_GRAPH_RETADDR_ENT); \
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct func_repeats_entry, \
@@ -624,12 +665,29 @@ bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
+void tracer_tracing_disable(struct trace_array *tr);
+void tracer_tracing_enable(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+
+/**
+ * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu
+ * @tr : the trace array to know if ring buffer is enabled
+ * @cpu: The cpu buffer to check if enabled
+ *
+ * Shows real state of the per CPU buffer if it is enabled or not.
+ */
+static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (tr->array_buffer.buffer)
+ return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu);
+ return false;
+}
+
int tracing_init_dentry(void);
struct ring_buffer_event;
@@ -640,6 +698,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer,
unsigned long len,
unsigned int trace_ctx);
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu);
+
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
@@ -651,9 +711,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap) __printf(2, 0);
char *trace_iter_expand_format(struct trace_iterator *iter);
+bool ignore_event(struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
@@ -669,7 +728,8 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ struct ftrace_regs *regs);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -678,9 +738,10 @@ void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-void trace_graph_return(struct ftrace_graph_ret *trace);
-int trace_graph_entry(struct ftrace_graph_ent *trace);
-void set_graph_array(struct trace_array *tr);
+void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -703,8 +764,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -756,10 +815,14 @@ extern void trace_find_cmdline(int pid, char comm[]);
extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
+extern int trace_events_enabled(struct trace_array *tr, const char *system);
+
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
extern unsigned long ftrace_number_of_pages;
extern unsigned long ftrace_number_of_groups;
+extern u64 ftrace_update_time;
+extern u64 ftrace_total_mod_time;
void ftrace_init_trace_array(struct trace_array *tr);
#else
static inline void ftrace_init_trace_array(struct trace_array *tr) { }
@@ -807,13 +870,15 @@ static inline void __init disable_tracing_selftest(const char *reason)
extern void *head_page(struct trace_array_cpu *data);
extern unsigned long long ns2usecs(u64 nsec);
-extern int
-trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_vprintk(unsigned long ip, const char *fmt, va_list args);
-extern int
-trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args);
+
+__printf(2, 0)
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(2, 0)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+__printf(3, 0)
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args);
+__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
@@ -867,6 +932,8 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_GRAPH_TIME 0x400
#define TRACE_GRAPH_PRINT_RETVAL 0x800
#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
+#define TRACE_GRAPH_PRINT_RETADDR 0x2000
+#define TRACE_GRAPH_ARGS 0x4000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -888,15 +955,68 @@ extern void graph_trace_close(struct trace_iterator *iter);
extern int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx);
+extern int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime);
+
+extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
+extern void free_fgraph_ops(struct trace_array *tr);
+
+enum {
+ TRACE_GRAPH_FL = 1,
+
+ /*
+ * In the very unlikely case that an interrupt came in
+ * at a start of graph tracing, and we want to trace
+ * the function in that interrupt, the depth can be greater
+ * than zero, because of the preempted start of a previous
+ * trace. In an even more unlikely case, depth could be 2
+ * if a softirq interrupted the start of graph tracing,
+ * followed by an interrupt preempting a start of graph
+ * tracing in the softirq, and depth can even be 3
+ * if an NMI came in at the start of an interrupt function
+ * that preempted a softirq start of a function that
+ * preempted normal context!!!! Luckily, it can't be
+ * greater than 3, so the next two bits are a mask
+ * of what the depth is when we set TRACE_GRAPH_FL
+ */
+
+ TRACE_GRAPH_DEPTH_START_BIT,
+ TRACE_GRAPH_DEPTH_END_BIT,
+
+ /*
+ * To implement set_graph_notrace, if this bit is set, we ignore
+ * function graph tracing of called functions, until the return
+ * function is called to clear it.
+ */
+ TRACE_GRAPH_NOTRACE_BIT,
+};
+
+#define TRACE_GRAPH_NOTRACE (1 << TRACE_GRAPH_NOTRACE_BIT)
+
+static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
+{
+ return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
+}
+
+static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth)
+{
+ *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT);
+ *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT;
+}
#ifdef CONFIG_DYNAMIC_FTRACE
extern struct ftrace_hash __rcu *ftrace_graph_hash;
extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int
+ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
unsigned long addr = trace->func;
int ret = 0;
@@ -918,13 +1038,12 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
}
if (ftrace_lookup_ip(hash, addr)) {
-
/*
* This needs to be cleared on the return functions
* when the depth is zero.
*/
- trace_recursion_set(TRACE_GRAPH_BIT);
- trace_recursion_set_depth(trace->depth);
+ *task_var |= TRACE_GRAPH_FL;
+ ftrace_graph_set_depth(task_var, trace->depth);
/*
* If no irqs are to be traced, but a set_graph_function
@@ -943,11 +1062,14 @@ out:
return ret;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void
+ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{
- if (trace_recursion_test(TRACE_GRAPH_BIT) &&
- trace->depth == trace_recursion_depth())
- trace_recursion_clear(TRACE_GRAPH_BIT);
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
+ if ((*task_var & TRACE_GRAPH_FL) &&
+ trace->depth == ftrace_graph_depth(task_var))
+ *task_var &= ~TRACE_GRAPH_FL;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
@@ -973,7 +1095,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
return ret;
}
#else
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
{
return 1;
}
@@ -982,27 +1104,38 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
return 0;
}
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
{ }
#endif /* CONFIG_DYNAMIC_FTRACE */
extern unsigned int fgraph_max_depth;
+extern bool fgraph_sleep_time;
-static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+static inline bool
+ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
{
+ unsigned long *task_var = fgraph_get_task_var(gops);
+
/* trace it when it is-nested-in or is a function enabled. */
- return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
- ftrace_graph_addr(trace)) ||
+ return !((*task_var & TRACE_GRAPH_FL) ||
+ ftrace_graph_addr(task_var, trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
}
+void fgraph_init_ops(struct ftrace_ops *dst_ops,
+ struct ftrace_ops *src_ops);
+
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
return TRACE_TYPE_UNHANDLED;
}
+static inline void free_fgraph_ops(struct trace_array *tr) { }
+/* ftrace_ops may not be defined */
+#define init_array_fgraph_ops(tr, ops) do { } while (0)
+#define allocate_fgraph_ops(tr, ops) ({ 0; })
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
extern struct list_head ftrace_pids;
@@ -1033,6 +1166,7 @@ void ftrace_destroy_function_files(struct trace_array *tr);
int ftrace_allocate_ftrace_ops(struct trace_array *tr);
void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
+struct trace_array *trace_get_global_array(void);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -1250,6 +1384,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(IRQ_INFO, "irq-info"), \
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
+ C(TRACE_PRINTK, "trace_printk_dest"), \
+ C(COPY_MARKER, "copy_trace_marker"),\
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
@@ -1330,7 +1466,8 @@ struct ftrace_event_field {
int filter_type;
int offset;
int size;
- int is_signed;
+ unsigned int is_signed:1;
+ unsigned int needs_test:1;
int len;
};
@@ -1357,10 +1494,6 @@ struct trace_subsystem_dir {
int nr_events;
};
-extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event);
-
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
struct trace_buffer *buffer,
struct ring_buffer_event *event,
@@ -1572,6 +1705,29 @@ static inline void *event_file_data(struct file *filp)
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be taken under the event_mutex lock, and then checked
+ * if the EVENT_FILE_FL_FREED flag is set. If it is, then the
+ * data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use the event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it is needed to be retrieved again and the event_mutex
+ * is still held, then the event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = READ_ONCE(file_inode(filp)->i_private);
+ if (!file || file->flags & EVENT_FILE_FL_FREED)
+ return NULL;
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
@@ -1596,7 +1752,7 @@ struct event_trigger_data {
unsigned long count;
int ref;
int flags;
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
char *filter_str;
@@ -1634,6 +1790,9 @@ extern int event_enable_register_trigger(char *glob,
extern void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file);
+extern struct event_trigger_data *
+trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param,
+ void *private_data);
extern void trigger_data_free(struct event_trigger_data *data);
extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
@@ -1660,11 +1819,6 @@ extern bool event_trigger_check_remove(const char *glob);
extern bool event_trigger_empty_param(const char *param);
extern int event_trigger_separate_filter(char *param_and_filter, char **param,
char **filter, bool param_required);
-extern struct event_trigger_data *
-event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data);
extern int event_trigger_parse_num(char *trigger,
struct event_trigger_data *trigger_data);
extern int event_trigger_set_filter(struct event_command *cmd_ops,
@@ -1841,7 +1995,7 @@ struct event_command {
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
struct trace_event_file *file);
- struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
+ const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
};
/**
@@ -2081,4 +2235,11 @@ static inline int rv_init_interface(void)
}
#endif
+/*
+ * This is used only to distinguish
+ * function address from trampoline code.
+ * So this value has no meaning.
+ */
+#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX)
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 811b08439406..e19c32f2a938 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -104,7 +104,7 @@ static void trace_do_benchmark(void)
stddev = 0;
delta = bm_total;
- delta = div64_u64(delta, bm_cnt);
+ do_div(delta, (u32)bm_cnt);
avg = delta;
if (stddev > 0) {
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e47fdb4c92fb..6809b370e991 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,10 +30,8 @@ static struct trace_array *branch_tracer;
static void
probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
- struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
struct trace_buffer *buffer;
- struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
@@ -55,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
raw_local_irq_save(flags);
current->trace_recursion |= TRACE_BRANCH_BIT;
- data = this_cpu_ptr(tr->array_buffer.data);
- if (atomic_read(&data->disabled))
+ if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id()))
goto out;
trace_ctx = tracing_gen_ctx_flags(flags);
@@ -74,16 +71,13 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
p--;
p++;
- strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
- strncpy(entry->file, p, TRACE_FILE_SIZE);
- entry->func[TRACE_FUNC_SIZE] = 0;
- entry->file[TRACE_FILE_SIZE] = 0;
+ strscpy(entry->func, f->data.func);
+ strscpy(entry->file, p);
entry->constant = f->constant;
entry->line = f->data.line;
entry->correct = val == expect;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
out:
current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 4702efb00ff2..4cb2ebc439be 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -154,5 +154,5 @@ static atomic64_t trace_counter;
*/
u64 notrace trace_clock_counter(void)
{
- return atomic64_add_return(1, &trace_counter);
+ return atomic64_inc_return(&trace_counter);
}
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4376887e0d8a..5d64a18cacac 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -16,7 +16,7 @@
#include "trace_output.h" /* for trace_event_sem */
#include "trace_dynevent.h"
-static DEFINE_MUTEX(dyn_event_ops_mutex);
+DEFINE_MUTEX(dyn_event_ops_mutex);
static LIST_HEAD(dyn_event_ops_list);
bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call)
@@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
int argc, ret = -ENOENT;
- char **argv;
+ char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc);
- argv = argv_split(GFP_KERNEL, raw_command, &argc);
if (!argv)
return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':') {
- ret = -EINVAL;
- goto out;
- }
+ if (argv[0][1] != ':')
+ return -EINVAL;
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event) {
- ret = -EINVAL;
- goto out;
- }
+ if (!event)
+ return -EINVAL;
event++;
}
@@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (!system && event[0] == '\0') {
- ret = -EINVAL;
- goto out;
- }
+ if (!system && event[0] == '\0')
+ return -EINVAL;
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
@@ -120,8 +113,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
}
tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
-out:
- argv_free(argv);
+ return ret;
+}
+
+/*
+ * Locked version of event creation. The event creation must be protected by
+ * dyn_event_ops_mutex because of protecting trace_probe_log.
+ */
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type)
+{
+ int ret;
+
+ mutex_lock(&dyn_event_ops_mutex);
+ ret = type->create(raw_command);
+ mutex_unlock(&dyn_event_ops_mutex);
return ret;
}
diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h
index 936477a111d3..beee3f8d7544 100644
--- a/kernel/trace/trace_dynevent.h
+++ b/kernel/trace/trace_dynevent.h
@@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos);
void dyn_event_seq_stop(struct seq_file *m, void *v);
int dyn_events_release_all(struct dyn_event_operations *type);
int dyn_event_release(const char *raw_command, struct dyn_event_operations *type);
+int dyn_event_create(const char *raw_command, struct dyn_event_operations *type);
/*
* for_each_dyn_event - iterate over the dyn_event list
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c47422b20908..de294ae2c5c5 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
- __field_fn( unsigned long, ip )
- __field_fn( unsigned long, parent_ip )
+ __field_fn( unsigned long, ip )
+ __field_fn( unsigned long, parent_ip )
+ __dynamic_array( unsigned long, args )
),
F_printk(" %ps <-- %ps",
@@ -72,22 +73,49 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
__field_packed( unsigned long, graph_ent, func )
- __field_packed( int, graph_ent, depth )
+ __field_packed( unsigned int, graph_ent, depth )
+ __dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
+ F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth)
);
-/* Function return entry */
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+
+/* Function call entry with a return address */
+FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
+
+ TRACE_GRAPH_RETADDR_ENT,
+
+ F_STRUCT(
+ __field_struct( struct fgraph_retaddr_ent, graph_ent )
+ __field_packed( unsigned long, graph_ent, func )
+ __field_packed( unsigned int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, retaddr )
+ ),
+
+ F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth,
+ (void *)__entry->retaddr)
+);
+
+#else
+
+#ifndef fgraph_retaddr_ent_entry
+#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry
+#endif
+
+#endif
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -96,13 +124,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_struct( struct ftrace_graph_ret, ret )
__field_packed( unsigned long, ret, func )
__field_packed( unsigned long, ret, retval )
- __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, depth )
__field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field(unsigned long long, calltime )
+ __field(unsigned long long, rettime )
),
- F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
+ F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth, __entry->retval)
@@ -110,6 +138,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
#else
+/* Function return entry */
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
TRACE_GRAPH_RET,
@@ -117,13 +146,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_packed( unsigned long, ret, func )
- __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, depth )
__field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field(unsigned long long, calltime )
+ __field(unsigned long long, rettime )
),
- F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
+ F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth)
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index b0e0ec85912e..916555f0de81 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -478,7 +478,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
__eprobe_trace_func(edata, rec);
}
-static struct event_trigger_ops eprobe_trigger_ops = {
+static const struct event_trigger_ops eprobe_trigger_ops = {
.trigger = eprobe_trigger_func,
.print = eprobe_trigger_print,
.init = eprobe_trigger_init,
@@ -507,8 +507,8 @@ static void eprobe_trigger_unreg_func(char *glob,
}
-static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
- char *param)
+static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
+ char *param)
{
return &eprobe_trigger_ops;
}
@@ -912,10 +912,17 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
}
- mutex_lock(&event_mutex);
- event_call = find_and_get_event(sys_name, sys_event);
- ep = alloc_event_probe(group, event, event_call, argc - 2);
- mutex_unlock(&event_mutex);
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ ret = -E2BIG;
+ goto error;
+ }
+
+ scoped_guard(mutex, &event_mutex) {
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ }
if (IS_ERR(ep)) {
ret = PTR_ERR(ep);
@@ -937,7 +944,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
argc -= 2; argv += 2;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
ret = trace_eprobe_tp_update_arg(ep, argv, i);
if (ret)
@@ -947,22 +954,28 @@ static int __trace_eprobe_create(int argc, const char *argv[])
if (ret < 0)
goto error;
init_trace_eprobe_call(ep);
- mutex_lock(&event_mutex);
- ret = trace_probe_register_event_call(&ep->tp);
- if (ret) {
- if (ret == -EEXIST) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(0, EVENT_EXIST);
+ scoped_guard(mutex, &event_mutex) {
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ if (ret < 0) {
+ trace_probe_unregister_event_call(&ep->tp);
+ goto error;
}
- mutex_unlock(&event_mutex);
- goto error;
}
- ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
- mutex_unlock(&event_mutex);
+ trace_probe_log_clear();
return ret;
+
parse_error:
ret = -EINVAL;
error:
+ trace_probe_log_clear();
trace_event_probe_cleanup(ep);
return ret;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 05e791241812..a6bb7577e8c5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
@@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
@@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
+ struct hw_perf_event *hwc = &p_event->hw;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(p_event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(p_event);
+ }
+
/*
* If TRACE_REG_PERF_ADD returns false; no custom action was performed
* and we need to take the default action of enqueueing our event on
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 52f75c36bbca..120531268abf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system)
}
static struct ftrace_event_field *
-__find_event_field(struct list_head *head, char *name)
+__find_event_field(struct list_head *head, const char *name)
{
struct ftrace_event_field *field;
@@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type, int len)
+ int is_signed, int filter_type, int len,
+ int need_test)
{
struct ftrace_event_field *field;
@@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->needs_test = need_test;
field->len = len;
list_add(&field->link, head);
@@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type, 0);
+ is_signed, filter_type, 0, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
static int trace_define_field_ext(struct trace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
- int filter_type, int len)
+ int filter_type, int len, int need_test)
{
struct list_head *head;
@@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type, len);
+ is_signed, filter_type, len, need_test);
}
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type, 0); \
+ filter_type, 0, 0); \
if (ret) \
return ret;
@@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER, 0); \
+ is_signed_type(type), FILTER_OTHER, \
+ 0, 0); \
if (ret) \
return ret;
@@ -244,19 +247,16 @@ int trace_event_get_offsets(struct trace_event_call *call)
return tail->offset + tail->size;
}
-/*
- * Check if the referenced field is an array and return true,
- * as arrays are OK to dereference.
- */
-static bool test_field(const char *fmt, struct trace_event_call *call)
+
+static struct trace_event_fields *find_event_field(const char *fmt,
+ struct trace_event_call *call)
{
struct trace_event_fields *field = call->class->fields_array;
- const char *array_descriptor;
const char *p = fmt;
int len;
if (!(len = str_has_prefix(fmt, "REC->")))
- return false;
+ return NULL;
fmt += len;
for (p = fmt; *p; p++) {
if (!isalnum(*p) && *p != '_')
@@ -265,16 +265,155 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
len = p - fmt;
for (; field->type; field++) {
- if (strncmp(field->name, fmt, len) ||
- field->name[len])
+ if (strncmp(field->name, fmt, len) || field->name[len])
continue;
- array_descriptor = strchr(field->type, '[');
- /* This is an array and is OK to dereference. */
- return array_descriptor != NULL;
+
+ return field;
+ }
+ return NULL;
+}
+
+/*
+ * Check if the referenced field is an array and return true,
+ * as arrays are OK to dereference.
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+ struct trace_event_fields *field;
+
+ field = find_event_field(fmt, call);
+ if (!field)
+ return false;
+
+ /* This is an array and is OK to dereference. */
+ return strchr(field->type, '[') != NULL;
+}
+
+/* Look for a string within an argument */
+static bool find_print_string(const char *arg, const char *str, const char *end)
+{
+ const char *r;
+
+ r = strstr(arg, str);
+ return r && r < end;
+}
+
+/* Return true if the argument pointer is safe */
+static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
+{
+ const char *r, *e, *a;
+
+ e = fmt + len;
+
+ /* Find the REC-> in the argument */
+ r = strstr(fmt, "REC->");
+ if (r && r < e) {
+ /*
+ * Addresses of events on the buffer, or an array on the buffer is
+ * OK to dereference. There's ways to fool this, but
+ * this is to catch common mistakes, not malicious code.
+ */
+ a = strchr(fmt, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ return true;
+ } else if (find_print_string(fmt, "__get_dynamic_array(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_sockaddr(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) {
+ return true;
}
return false;
}
+/* Return true if the string is safe */
+static bool process_string(const char *fmt, int len, struct trace_event_call *call)
+{
+ struct trace_event_fields *field;
+ const char *r, *e, *s;
+
+ e = fmt + len;
+
+ /*
+ * There are several helper functions that return strings.
+ * If the argument contains a function, then assume its field is valid.
+ * It is considered that the argument has a function if it has:
+ * alphanumeric or '_' before a parenthesis.
+ */
+ s = fmt;
+ do {
+ r = strstr(s, "(");
+ if (!r || r >= e)
+ break;
+ for (int i = 1; r - i >= s; i++) {
+ char ch = *(r - i);
+ if (isspace(ch))
+ continue;
+ if (isalnum(ch) || ch == '_')
+ return true;
+ /* Anything else, this isn't a function */
+ break;
+ }
+ /* A function could be wrapped in parethesis, try the next one */
+ s = r + 1;
+ } while (s < e);
+
+ /*
+ * Check for arrays. If the argument has: foo[REC->val]
+ * then it is very likely that foo is an array of strings
+ * that are safe to use.
+ */
+ r = strstr(s, "[");
+ if (r && r < e) {
+ r = strstr(r, "REC->");
+ if (r && r < e)
+ return true;
+ }
+
+ /*
+ * If there's any strings in the argument consider this arg OK as it
+ * could be: REC->field ? "foo" : "bar" and we don't want to get into
+ * verifying that logic here.
+ */
+ if (find_print_string(fmt, "\"", e))
+ return true;
+
+ /* Dereferenced strings are also valid like any other pointer */
+ if (process_pointer(fmt, len, call))
+ return true;
+
+ /* Make sure the field is found */
+ field = find_event_field(fmt, call);
+ if (!field)
+ return false;
+
+ /* Test this field's string before printing the event */
+ call->flags |= TRACE_EVENT_FL_TEST_STR;
+ field->needs_test = 1;
+
+ return true;
+}
+
+static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len,
+ u64 *dereference_flags, int arg,
+ struct trace_event_call *call)
+{
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(arg_str, len, call))
+ *dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(arg_str, len, call))
+ *dereference_flags &= ~(1ULL << arg);
+ else
+ pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n",
+ len, arg_str);
+}
+
/*
* Examine the print fmt of the event looking for unsafe dereference
* pointers using %p* that could be recorded in the trace event and
@@ -284,13 +423,14 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
static void test_event_printk(struct trace_event_call *call)
{
u64 dereference_flags = 0;
+ u64 string_flags = 0;
bool first = true;
- const char *fmt, *c, *r, *a;
+ const char *fmt;
int parens = 0;
char in_quote = 0;
int start_arg = 0;
int arg = 0;
- int i;
+ int i, e;
fmt = call->print_fmt;
@@ -344,6 +484,7 @@ static void test_event_printk(struct trace_event_call *call)
case '%':
continue;
case 'p':
+ do_pointer:
/* Find dereferencing fields */
switch (fmt[i + 1]) {
case 'B': case 'R': case 'r':
@@ -372,10 +513,24 @@ static void test_event_printk(struct trace_event_call *call)
continue;
if (fmt[i + j] == '*') {
star = true;
+ /* Handle %*pbl case */
+ if (!j && fmt[i + 1] == 'p') {
+ arg++;
+ i++;
+ goto do_pointer;
+ }
continue;
}
- if ((fmt[i + j] == 's') && star)
- arg++;
+ if ((fmt[i + j] == 's')) {
+ if (star)
+ arg++;
+ if (WARN_ONCE(arg == 63,
+ "Too many args for event: %s",
+ trace_event_name(call)))
+ return;
+ dereference_flags |= 1ULL << arg;
+ string_flags |= 1ULL << arg;
+ }
break;
}
break;
@@ -403,42 +558,43 @@ static void test_event_printk(struct trace_event_call *call)
case ',':
if (in_quote || parens)
continue;
+ e = i;
i++;
while (isspace(fmt[i]))
i++;
- start_arg = i;
- if (!(dereference_flags & (1ULL << arg)))
- goto next_arg;
- /* Find the REC-> in the argument */
- c = strchr(fmt + i, ',');
- r = strstr(fmt + i, "REC->");
- if (r && (!c || r < c)) {
- /*
- * Addresses of events on the buffer,
- * or an array on the buffer is
- * OK to dereference.
- * There's ways to fool this, but
- * this is to catch common mistakes,
- * not malicious code.
- */
- a = strchr(fmt + i, '&');
- if ((a && (a < r)) || test_field(r, call))
- dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
+ /*
+ * If start_arg is zero, then this is the start of the
+ * first argument. The processing of the argument happens
+ * when the end of the argument is found, as it needs to
+ * handle paranthesis and such.
+ */
+ if (!start_arg) {
+ start_arg = i;
+ /* Balance out the i++ in the for loop */
+ i--;
+ continue;
}
- next_arg:
- i--;
+ if (dereference_flags & (1ULL << arg)) {
+ handle_dereference_arg(fmt + start_arg, string_flags,
+ e - start_arg,
+ &dereference_flags, arg, call);
+ }
+
+ start_arg = i;
arg++;
+ /* Balance out the i++ in the for loop */
+ i--;
}
}
+ if (dereference_flags & (1ULL << arg)) {
+ handle_dereference_arg(fmt + start_arg, string_flags,
+ i - start_arg,
+ &dereference_flags, arg, call);
+ }
+
/*
* If you triggered the below warning, the trace event reported
* uses an unsafe dereference pointer %p*. As the data stored
@@ -476,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
{
struct trace_array *tr = trace_file->tr;
- struct trace_array_cpu *data;
struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
@@ -486,9 +641,11 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
if (!pid_list && !no_pid_list)
return false;
- data = this_cpu_ptr(tr->array_buffer.data);
-
- return data->ignore_pid;
+ /*
+ * This is recorded at every sched_switch for this task.
+ * Thus, even if the task migrates the ignore value will be the same.
+ */
+ return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0;
}
EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
@@ -651,7 +808,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER, file);
+ ret = call->class->reg(call, TRACE_REG_UNREGISTER, file);
+
+ WARN_ON_ONCE(ret);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
if (file->flags & EVENT_FILE_FL_SOFT_MODE)
@@ -730,6 +889,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file,
return __ftrace_event_enable_disable(file, enable, 0);
}
+#ifdef CONFIG_MODULES
+struct event_mod_load {
+ struct list_head list;
+ char *module;
+ char *match;
+ char *system;
+ char *event;
+};
+
+static void free_event_mod(struct event_mod_load *event_mod)
+{
+ list_del(&event_mod->list);
+ kfree(event_mod->module);
+ kfree(event_mod->match);
+ kfree(event_mod->system);
+ kfree(event_mod->event);
+ kfree(event_mod);
+}
+
+static void clear_mod_events(struct trace_array *tr)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ free_event_mod(event_mod);
+ }
+}
+
+static int remove_cache_mod(struct trace_array *tr, const char *mod,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod, *n;
+ int ret = -EINVAL;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod) != 0)
+ continue;
+
+ if (match && strcmp(event_mod->match, match) != 0)
+ continue;
+
+ if (system &&
+ (!event_mod->system || strcmp(event_mod->system, system) != 0))
+ continue;
+
+ if (event &&
+ (!event_mod->event || strcmp(event_mod->event, event) != 0))
+ continue;
+
+ free_event_mod(event_mod);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod;
+
+ /* If the module exists, then this just failed to find an event */
+ if (module_exists(mod))
+ return -EINVAL;
+
+ /* See if this is to remove a cached filter */
+ if (!set)
+ return remove_cache_mod(tr, mod, match, system, event);
+
+ event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL);
+ if (!event_mod)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&event_mod->list);
+ event_mod->module = kstrdup(mod, GFP_KERNEL);
+ if (!event_mod->module)
+ goto out_free;
+
+ if (match) {
+ event_mod->match = kstrdup(match, GFP_KERNEL);
+ if (!event_mod->match)
+ goto out_free;
+ }
+
+ if (system) {
+ event_mod->system = kstrdup(system, GFP_KERNEL);
+ if (!event_mod->system)
+ goto out_free;
+ }
+
+ if (event) {
+ event_mod->event = kstrdup(event, GFP_KERNEL);
+ if (!event_mod->event)
+ goto out_free;
+ }
+
+ list_add(&event_mod->list, &tr->mod_events);
+
+ return 0;
+
+ out_free:
+ free_event_mod(event_mod);
+
+ return -ENOMEM;
+}
+#else /* CONFIG_MODULES */
+static inline void clear_mod_events(struct trace_array *tr) { }
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ return -EINVAL;
+}
+#endif
+
static void ftrace_clear_events(struct trace_array *tr)
{
struct trace_event_file *file;
@@ -738,6 +1011,7 @@ static void ftrace_clear_events(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ftrace_event_enable_disable(file, 0);
}
+ clear_mod_events(tr);
mutex_unlock(&event_mutex);
}
@@ -992,18 +1266,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
void event_file_get(struct trace_event_file *file)
{
- atomic_inc(&file->ref);
+ refcount_inc(&file->ref);
}
void event_file_put(struct trace_event_file *file)
{
- if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+ if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
if (file->flags & EVENT_FILE_FL_FREED)
kmem_cache_free(file_cachep, file);
return;
}
- if (atomic_dec_and_test(&file->ref)) {
+ if (refcount_dec_and_test(&file->ref)) {
/* Count should only go to zero when it is freed */
if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
return;
@@ -1026,17 +1300,36 @@ static void remove_event_file_dir(struct trace_event_file *file)
*/
static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
struct trace_event_file *file;
struct trace_event_call *call;
+ char *module __free(kfree) = NULL;
const char *name;
int ret = -EINVAL;
int eret = 0;
+ if (mod) {
+ char *p;
+
+ module = kstrdup(mod, GFP_KERNEL);
+ if (!module)
+ return -ENOMEM;
+
+ /* Replace all '-' with '_' as that's what modules do */
+ for (p = strchr(module, '-'); p; p = strchr(p + 1, '-'))
+ *p = '_';
+ }
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+
+ /* If a module is specified, skip events that are not that module */
+ if (module && (!call->module || strcmp(module_name(call->module), module)))
+ continue;
+
name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
@@ -1069,16 +1362,24 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
ret = eret;
}
+ /*
+ * If this is a module setting and nothing was found,
+ * check if the module was loaded. If it wasn't cache it.
+ */
+ if (module && ret == -EINVAL && !eret)
+ ret = cache_mod(tr, module, set, match, sub, event);
+
return ret;
}
static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
int ret;
mutex_lock(&event_mutex);
- ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
return ret;
@@ -1086,11 +1387,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
- char *event = NULL, *sub = NULL, *match;
+ char *event = NULL, *sub = NULL, *match, *mod;
int ret;
if (!tr)
return -ENOENT;
+
+ /* Modules events can be appened with :mod:<module> */
+ mod = strstr(buf, ":mod:");
+ if (mod) {
+ *mod = '\0';
+ /* move to the module name */
+ mod += 5;
+ }
+
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -1113,9 +1423,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
sub = NULL;
if (!strlen(event) || strcmp(event, "*") == 0)
event = NULL;
+ } else if (mod) {
+ /* Allow wildcard for no length or star */
+ if (!strlen(match) || strcmp(match, "*") == 0)
+ match = NULL;
}
- ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod);
/* Put back the colon to allow this to be called again */
if (buf)
@@ -1143,7 +1457,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
if (!tr)
return -ENODEV;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -1169,7 +1483,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system,
return -ENOENT;
set = (enable == true) ? 1 : 0;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
@@ -1256,37 +1570,78 @@ static void *t_start(struct seq_file *m, loff_t *pos)
return file;
}
+enum set_event_iter_type {
+ SET_EVENT_FILE,
+ SET_EVENT_MOD,
+};
+
+struct set_event_iter {
+ enum set_event_iter_type type;
+ union {
+ struct trace_event_file *file;
+ struct event_mod_load *event_mod;
+ };
+};
+
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_file *file = v;
+ struct set_event_iter *iter = v;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(file, &tr->events, list) {
- if (file->flags & EVENT_FILE_FL_ENABLED)
- return file;
+ if (iter->type == SET_EVENT_FILE) {
+ file = iter->file;
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & EVENT_FILE_FL_ENABLED) {
+ iter->file = file;
+ return iter;
+ }
+ }
+#ifdef CONFIG_MODULES
+ iter->type = SET_EVENT_MOD;
+ iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list);
+#endif
}
+#ifdef CONFIG_MODULES
+ list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list)
+ return iter;
+#endif
+
+ /*
+ * The iter is allocated in s_start() and passed via the 'v'
+ * parameter. To stop the iterator, NULL must be returned. But
+ * the return value is what the 'v' parameter in s_stop() receives
+ * and frees. Free iter here as it will no longer be used.
+ */
+ kfree(iter);
return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct trace_event_file *file;
struct trace_array *tr = m->private;
+ struct set_event_iter *iter;
loff_t l;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct trace_event_file, list);
+ iter->type = SET_EVENT_FILE;
+ iter->file = list_entry(&tr->events, struct trace_event_file, list);
+
for (l = 0; l <= *pos; ) {
- file = s_next(m, file, &l);
- if (!file)
+ iter = s_next(m, iter, &l);
+ if (!iter)
break;
}
- return file;
+ return iter;
}
static int t_show(struct seq_file *m, void *v)
@@ -1306,6 +1661,45 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+#ifdef CONFIG_MODULES
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+ const char *system;
+ const char *event;
+
+ if (iter->type == SET_EVENT_FILE)
+ return t_show(m, iter->file);
+
+ /* When match is set, system and event are not */
+ if (iter->event_mod->match) {
+ seq_printf(m, "%s:mod:%s\n", iter->event_mod->match,
+ iter->event_mod->module);
+ return 0;
+ }
+
+ system = iter->event_mod->system ? : "*";
+ event = iter->event_mod->event ? : "*";
+
+ seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module);
+
+ return 0;
+}
+#else /* CONFIG_MODULES */
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+
+ return t_show(m, iter->file);
+}
+#endif
+
+static void s_stop(struct seq_file *m, void *v)
+{
+ kfree(v);
+ t_stop(m, NULL);
+}
+
static void *
__next(struct seq_file *m, void *v, loff_t *pos, int type)
{
@@ -1386,12 +1780,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[4] = "0";
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (likely(file))
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file || flags & EVENT_FILE_FL_FREED)
+ if (!file)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1419,21 +1813,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
switch (val) {
case 0:
case 1:
- ret = -ENODEV;
- mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
- ret = tracing_update_buffers(file->tr);
- if (ret < 0) {
- mutex_unlock(&event_mutex);
- return ret;
- }
- ret = ftrace_event_enable_disable(file, val);
- }
- mutex_unlock(&event_mutex);
+ file = event_file_file(filp);
+ if (!file)
+ return -ENODEV;
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0)
+ return ret;
+ ret = ftrace_event_enable_disable(file, val);
+ if (ret < 0)
+ return ret;
break;
default:
@@ -1442,31 +1835,31 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
- return ret ? ret : cnt;
+ return cnt;
}
-static ssize_t
-system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
- loff_t *ppos)
+/*
+ * Returns:
+ * 0 : no events exist?
+ * 1 : all events are disabled
+ * 2 : all events are enabled
+ * 3 : some events are enabled and some are enabled
+ */
+int trace_events_enabled(struct trace_array *tr, const char *system)
{
- const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct trace_subsystem_dir *dir = filp->private_data;
- struct event_subsystem *system = dir->subsystem;
struct trace_event_call *call;
struct trace_event_file *file;
- struct trace_array *tr = dir->tr;
- char buf[2];
int set = 0;
- int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
!trace_event_name(call) || !call->class || !call->class->reg)
continue;
- if (system && strcmp(call->class->system, system->name) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -1482,7 +1875,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
if (set == 3)
break;
}
- mutex_unlock(&event_mutex);
+
+ return set;
+}
+
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char set_to_char[4] = { '?', '0', '1', 'X' };
+ struct trace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_array *tr = dir->tr;
+ char buf[2];
+ int set;
+ int ret;
+
+ set = trace_events_enabled(tr, system ? system->name : NULL);
buf[0] = set_to_char[set];
buf[1] = '\n';
@@ -1520,7 +1929,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (system)
name = system->name;
- ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL);
if (ret)
goto out;
@@ -1540,7 +1949,8 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -1572,7 +1982,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -1627,12 +2038,14 @@ static int f_show(struct seq_file *m, void *v)
static void *f_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_event_file *file;
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- if (!event_file_data(m->private))
+ file = event_file_file(m->private);
+ if (!file)
return ERR_PTR(-ENODEV);
while (l < *pos && p)
@@ -1706,8 +2119,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file && !(file->flags & EVENT_FILE_FL_FREED))
+ file = event_file_file(filp);
+ if (file)
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1736,9 +2149,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return PTR_ERR(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- err = apply_event_filter(file, buf);
+ file = event_file_file(filp);
+ if (file) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ err = -ENODEV;
+ else
+ err = apply_event_filter(file, buf);
+ }
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2010,7 +2427,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
if (type == TRACE_PIDS) {
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
@@ -2026,7 +2443,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
if (type == TRACE_PIDS)
rcu_assign_pointer(tr->filtered_pids, pid_list);
@@ -2051,11 +2468,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
*/
on_each_cpu(ignore_task_cpu, tr, 1);
- out:
- mutex_unlock(&event_mutex);
-
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -2090,8 +2503,8 @@ static const struct seq_operations show_event_seq_ops = {
static const struct seq_operations show_set_event_seq_ops = {
.start = s_start,
.next = s_next,
- .show = t_show,
- .stop = t_stop,
+ .show = s_show,
+ .stop = s_stop,
};
static const struct seq_operations show_set_pid_seq_ops = {
@@ -2463,7 +2876,7 @@ event_define_fields(struct trace_event_call *call)
ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
field->is_signed, field->filter_type,
- field->len);
+ field->len, field->needs_test);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
@@ -2485,7 +2898,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
if (strcmp(name, "format") == 0) {
*mode = TRACE_MODE_READ;
*fops = &ftrace_event_format_fops;
- *data = call;
return 1;
}
@@ -2552,6 +2964,14 @@ static int event_callback(const char *name, umode_t *mode, void **data,
return 0;
}
+/* The file is incremented on creation and freeing the enable file decrements it */
+static void event_release(const char *name, void *data)
+{
+ struct trace_event_file *file = data;
+
+ event_file_put(file);
+}
+
static int
event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
@@ -2566,6 +2986,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
{
.name = "enable",
.callback = event_callback,
+ .release = event_release,
},
{
.name = "filter",
@@ -2634,6 +3055,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
return ret;
}
+ /* Gets decremented on freeing of the "enable" file */
+ event_file_get(file);
+
return 0;
}
@@ -2953,6 +3377,20 @@ static bool event_in_systems(struct trace_event_call *call,
return !*p || isspace(*p) || *p == ',';
}
+#ifdef CONFIG_HIST_TRIGGERS
+/*
+ * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger
+ * may happen in any context.
+ */
+static void hist_poll_event_irq_work(struct irq_work *work)
+{
+ wake_up_all(&hist_poll_wq);
+}
+
+DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work);
+DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq);
+#endif
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -2984,7 +3422,7 @@ trace_create_new_event(struct trace_event_call *call,
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events);
- event_file_get(file);
+ refcount_set(&file->ref, 1);
return file;
}
@@ -3111,13 +3549,13 @@ int trace_add_event_call(struct trace_event_call *call)
int ret;
lockdep_assert_held(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = __register_event(call, NULL);
- if (ret >= 0)
- __add_event_to_tracers(call);
+ if (ret < 0)
+ return ret;
- mutex_unlock(&trace_types_lock);
+ __add_event_to_tracers(call);
return ret;
}
EXPORT_SYMBOL_GPL(trace_add_event_call);
@@ -3130,8 +3568,6 @@ static void __trace_remove_event_call(struct trace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- free_event_filter(call->filter);
- call->filter = NULL;
}
static int probe_remove_event_call(struct trace_event_call *call)
@@ -3199,6 +3635,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call);
event++)
#ifdef CONFIG_MODULES
+static void update_mod_cache(struct trace_array *tr, struct module *mod)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod->name) != 0)
+ continue;
+
+ __ftrace_set_clr_event_nolock(tr, event_mod->match,
+ event_mod->system,
+ event_mod->event, 1, mod->name);
+ free_event_mod(event_mod);
+ }
+}
+
+static void update_cache_events(struct module *mod)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ update_mod_cache(tr, mod);
+}
static void trace_module_add_events(struct module *mod)
{
@@ -3221,6 +3679,8 @@ static void trace_module_add_events(struct module *mod)
__register_event(*call, mod);
__add_event_to_tracers(*call);
}
+
+ update_cache_events(mod);
}
static void trace_module_remove_events(struct module *mod)
@@ -3373,30 +3833,21 @@ struct trace_event_file *trace_get_event_file(const char *instance,
return ERR_PTR(ret);
}
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
file = find_event_file(tr, system, event);
if (!file) {
trace_array_put(tr);
- ret = -EINVAL;
- goto out;
+ return ERR_PTR(-EINVAL);
}
/* Don't let event modules unload while in use */
ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
- ret = -EBUSY;
- goto out;
+ return ERR_PTR(-EBUSY);
}
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
-
- if (ret)
- file = ERR_PTR(ret);
-
return file;
}
EXPORT_SYMBOL_GPL(trace_get_event_file);
@@ -3614,6 +4065,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
+ unsigned long count = -1;
const char *system;
const char *event;
char *number;
@@ -3633,12 +4085,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
event = strsep(&param, ":");
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- ret = -EINVAL;
file = find_event_file(tr, system, event);
if (!file)
- goto out;
+ return -EINVAL;
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
@@ -3647,74 +4098,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
else
ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- goto out;
- }
+ if (glob[0] == '!')
+ return unregister_ftrace_function_probe_func(glob+1, tr, ops);
- ret = -ENOMEM;
+ if (param) {
+ number = strsep(&param, ":");
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
+ if (!strlen(number))
+ return -EINVAL;
- data->enable = enable;
- data->count = -1;
- data->file = file;
-
- if (!param)
- goto out_reg;
-
- number = strsep(&param, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &data->count);
- if (ret)
- goto out_free;
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &count);
+ if (ret)
+ return ret;
+ }
- out_reg:
/* Don't let event modules unload while probe registered */
ret = trace_event_try_get_ref(file->event_call);
- if (!ret) {
- ret = -EBUSY;
- goto out_free;
- }
+ if (!ret)
+ return -EBUSY;
ret = __ftrace_event_enable_disable(file, 1, 1);
if (ret < 0)
goto out_put;
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out_put;
+
+ data->enable = enable;
+ data->count = count;
+ data->file = file;
+
ret = register_ftrace_function_probe(glob, tr, ops, data);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
* Consider no functions a failure too.
*/
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
- goto out_disable;
+
/* Just return zero, not the number of enabled functions */
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
- return ret;
+ if (ret > 0)
+ return 0;
+
+ kfree(data);
+
+ if (!ret)
+ ret = -ENOENT;
- out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
trace_event_put_ref(file->event_call);
- out_free:
- kfree(data);
- goto out;
+ return ret;
}
static struct ftrace_func_command event_enable_cmd = {
@@ -3937,20 +4376,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ return ret;
down_write(&trace_event_sem);
__trace_early_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
/* Must be called with event_mutex held */
@@ -3965,7 +4401,7 @@ int event_trace_del_tracer(struct trace_array *tr)
__ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
/* Disable any running events */
- __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL);
/* Make sure no more events are being executed */
tracepoint_synchronize_unregister();
@@ -4249,7 +4685,7 @@ static __init void event_trace_self_tests(void)
pr_info("Testing event system %s: ", system->name);
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling system %s\n",
system->name);
@@ -4258,7 +4694,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling system %s\n",
system->name);
@@ -4273,7 +4709,7 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on all trace events:\n");
pr_info("Testing all events: ");
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling all events\n");
return;
@@ -4282,7 +4718,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
/* reset sysname */
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling all events\n");
return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0c611b281a5b..2048560264bb 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str)
kstr = ubuf->buffer;
/* For safety, do not trust the string pointer */
- if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
}
@@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str)
/* user space address? */
ustr = (char __user *)str;
- if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0)
return NULL;
return kstr;
@@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- strncpy(num_buf, str + s, len);
+ memcpy(num_buf, str + s, len);
num_buf[len] = 0;
ret = kstrtoul(num_buf, 0, &ip);
@@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data,
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
- strncpy(pred->regex->pattern, str + s, len);
+ memcpy(pred->regex->pattern, str + s, len);
pred->regex->pattern[len] = 0;
} else if (!strncmp(str + i, "CPUS", 4)) {
@@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data,
if (!pred->regex)
goto err_mem;
pred->regex->len = len;
- strncpy(pred->regex->pattern, str + s, len);
+ memcpy(pred->regex->pattern, str + s, len);
pred->regex->pattern[len] = 0;
filter_build_regex(pred);
@@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
- strncpy(num_buf, str + s, len);
+ memcpy(num_buf, str + s, len);
num_buf[len] = 0;
/* Make sure it is a value */
@@ -2405,13 +2405,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
struct event_filter *filter = NULL;
int err = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
/* Make sure the system still has events */
- if (!dir->nr_events) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!dir->nr_events)
+ return -ENODEV;
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(dir, tr);
@@ -2422,7 +2420,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
- goto out_unlock;
+ return 0;
}
err = create_system_filter(dir, filter_string, &filter);
@@ -2434,8 +2432,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
__free_filter(system->filter);
system->filter = filter;
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
@@ -2612,17 +2608,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
struct event_filter *filter = NULL;
struct trace_event_call *call;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
call = event->tp_event;
- err = -EINVAL;
if (!call)
- goto out_unlock;
+ return -EINVAL;
- err = -EEXIST;
if (event->filter)
- goto out_unlock;
+ return -EEXIST;
err = create_filter(NULL, call, filter_str, false, &filter);
if (err)
@@ -2637,9 +2631,6 @@ free_filter:
if (err || ftrace_event_is_function(call))
__free_filter(filter);
-out_unlock:
- mutex_unlock(&event_mutex);
-
return err;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..1d536219b624 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -114,6 +114,7 @@ enum hist_field_fn {
HIST_FIELD_FN_BUCKET,
HIST_FIELD_FN_TIMESTAMP,
HIST_FIELD_FN_CPU,
+ HIST_FIELD_FN_COMM,
HIST_FIELD_FN_STRING,
HIST_FIELD_FN_DYNSTRING,
HIST_FIELD_FN_RELDYNSTRING,
@@ -506,6 +507,7 @@ enum hist_field_flags {
HIST_FIELD_FL_CONST = 1 << 18,
HIST_FIELD_FL_PERCENT = 1 << 19,
HIST_FIELD_FL_GRAPH = 1 << 20,
+ HIST_FIELD_FL_COMM = 1 << 21,
};
struct var_defs {
@@ -822,7 +824,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
{
struct tracepoint *tp = event->tp;
- if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+ if (unlikely(static_key_enabled(&tp->key))) {
struct tracepoint_func *probe_func_ptr;
synth_probe_func_t probe_func;
void *__data;
@@ -885,6 +887,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field,
return cpu;
}
+static u64 hist_field_comm(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ return (u64)(unsigned long)current->comm;
+}
+
/**
* check_field_for_var_ref - Check if a VAR_REF field references a variable
* @hist_field: The VAR_REF field to check
@@ -1338,6 +1349,8 @@ static const char *hist_field_name(struct hist_field *field,
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
field_name = "common_cpu";
+ else if (field->flags & HIST_FIELD_FL_COMM)
+ field_name = "common_comm";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
@@ -1354,10 +1367,7 @@ static const char *hist_field_name(struct hist_field *field,
} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
field_name = "common_timestamp";
else if (field->flags & HIST_FIELD_FL_STACKTRACE) {
- if (field->field)
- field_name = field->field->name;
- else
- field_name = "common_stacktrace";
+ field_name = "common_stacktrace";
} else if (field->flags & HIST_FIELD_FL_HITCOUNT)
field_name = "hitcount";
@@ -1599,7 +1609,7 @@ static inline void save_comm(char *comm, struct task_struct *task)
return;
}
- strncpy(comm, task->comm, TASK_COMM_LEN);
+ strscpy(comm, task->comm, TASK_COMM_LEN);
}
static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -2018,6 +2028,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out;
}
+ if (flags & HIST_FIELD_FL_COMM) {
+ hist_field->fn_num = HIST_FIELD_FN_COMM;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = "char[]";
+ goto out;
+ }
+
if (WARN_ON_ONCE(!field))
goto out;
@@ -2362,9 +2379,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->attrs->ts_in_usecs = true;
} else if (strcmp(field_name, "common_stacktrace") == 0) {
*flags |= HIST_FIELD_FL_STACKTRACE;
- } else if (strcmp(field_name, "common_cpu") == 0)
+ } else if (strcmp(field_name, "common_cpu") == 0) {
*flags |= HIST_FIELD_FL_CPU;
- else if (strcmp(field_name, "hitcount") == 0)
+ } else if (strcmp(field_name, "common_comm") == 0) {
+ *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING;
+ } else if (strcmp(field_name, "hitcount") == 0)
*flags |= HIST_FIELD_FL_HITCOUNT;
else {
field = trace_find_event_field(file->event_call, field_name);
@@ -2380,6 +2399,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_CPU;
} else if (field && field->filter_type == FILTER_STACKTRACE) {
*flags |= HIST_FIELD_FL_STACKTRACE;
+ } else if (field && field->filter_type == FILTER_COMM) {
+ *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
errpos(field_name));
@@ -3405,7 +3426,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
elt_data = context->elt->private_data;
track_elt_data = track_data->elt.private_data;
if (elt_data->comm)
- strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+ strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
track_data->updated = true;
@@ -4330,6 +4351,8 @@ static u64 hist_fn_call(struct hist_field *hist_field,
return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_CPU:
return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_COMM:
+ return hist_field_comm(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_STRING:
return hist_field_string(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DYNSTRING:
@@ -5215,22 +5238,25 @@ static inline void add_to_key(char *compound_key, void *key,
size_t size = key_field->size;
if (key_field->flags & HIST_FIELD_FL_STRING) {
- struct ftrace_event_field *field;
- field = key_field->field;
- if (field->filter_type == FILTER_DYN_STRING ||
- field->filter_type == FILTER_RDYN_STRING)
- size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_STATIC_STRING)
- size = field->size;
+ if (key_field->flags & HIST_FIELD_FL_COMM) {
+ size = strlen((char *)key);
+ } else {
+ struct ftrace_event_field *field;
+
+ field = key_field->field;
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING)
+ size = *(u32 *)(rec + field->offset) >> 16;
+ else if (field->filter_type == FILTER_STATIC_STRING)
+ size = field->size;
+ }
/* ensure NULL-termination */
if (size > key_field->size - 1)
size = key_field->size - 1;
-
- strncpy(compound_key + key_field->offset, (char *)key, size);
- } else
- memcpy(compound_key + key_field->offset, key, size);
+ }
+ memcpy(compound_key + key_field->offset, key, size);
}
static void
@@ -5249,17 +5275,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
}
}
+/*
+ * The hist_pad structure is used to save information to create
+ * a histogram from the histogram trigger. It's too big to store
+ * on the stack, so when the histogram trigger is initialized
+ * a percpu array of 4 hist_pad structures is allocated.
+ * This will cover every context from normal, softirq, irq and NMI
+ * in the very unlikely event that a tigger happens at each of
+ * these contexts and interrupts a currently active trigger.
+ */
+struct hist_pad {
+ unsigned long entries[HIST_STACKTRACE_DEPTH];
+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
+ char compound_key[HIST_KEY_SIZE_MAX];
+};
+
+static struct hist_pad __percpu *hist_pads;
+static DEFINE_PER_CPU(int, hist_pad_cnt);
+static refcount_t hist_pad_ref;
+
+/* One hist_pad for every context (normal, softirq, irq, NMI) */
+#define MAX_HIST_CNT 4
+
+static int alloc_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (refcount_read(&hist_pad_ref)) {
+ refcount_inc(&hist_pad_ref);
+ return 0;
+ }
+
+ hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT,
+ __alignof__(struct hist_pad));
+ if (!hist_pads)
+ return -ENOMEM;
+
+ refcount_set(&hist_pad_ref, 1);
+ return 0;
+}
+
+static void free_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (!refcount_dec_and_test(&hist_pad_ref))
+ return;
+
+ free_percpu(hist_pads);
+ hist_pads = NULL;
+}
+
+static struct hist_pad *get_hist_pad(void)
+{
+ struct hist_pad *hist_pad;
+ int cnt;
+
+ if (WARN_ON_ONCE(!hist_pads))
+ return NULL;
+
+ preempt_disable();
+
+ hist_pad = per_cpu_ptr(hist_pads, smp_processor_id());
+
+ if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) {
+ preempt_enable();
+ return NULL;
+ }
+
+ cnt = this_cpu_inc_return(hist_pad_cnt) - 1;
+
+ return &hist_pad[cnt];
+}
+
+static void put_hist_pad(void)
+{
+ this_cpu_dec(hist_pad_cnt);
+ preempt_enable();
+}
+
static void event_hist_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
- unsigned long entries[HIST_STACKTRACE_DEPTH];
- u64 var_ref_vals[TRACING_MAP_VARS_MAX];
- char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
struct hist_field *key_field;
+ struct hist_pad *hist_pad;
u64 field_contents;
void *key = NULL;
unsigned int i;
@@ -5267,12 +5370,18 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (unlikely(!rbe))
return;
- memset(compound_key, 0, hist_data->key_size);
+ hist_pad = get_hist_pad();
+ if (!hist_pad)
+ return;
+
+ memset(hist_pad->compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+ unsigned long *entries = hist_pad->entries;
+
memset(entries, 0, HIST_STACKTRACE_SIZE);
if (key_field->field) {
unsigned long *stack, n_entries;
@@ -5296,24 +5405,31 @@ static void event_hist_trigger(struct event_trigger_data *data,
}
if (use_compound_key)
- add_to_key(compound_key, key, key_field, rec);
+ add_to_key(hist_pad->compound_key, key, key_field, rec);
}
if (use_compound_key)
- key = compound_key;
+ key = hist_pad->compound_key;
if (hist_data->n_var_refs &&
- !resolve_var_refs(hist_data, key, var_ref_vals, false))
- return;
+ !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false))
+ goto out;
elt = tracing_map_insert(hist_data->map, key);
if (!elt)
- return;
+ goto out;
+
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals);
- hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
+ if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) {
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe,
+ key, hist_pad->var_ref_vals);
+ }
+
+ hist_poll_wakeup();
- if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+ out:
+ put_hist_pad();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5593,49 +5709,137 @@ static void hist_trigger_show(struct seq_file *m,
n_entries, (u64)atomic64_read(&hist_data->map->drops));
}
+struct hist_file_data {
+ struct file *file;
+ u64 last_read;
+ u64 last_act;
+};
+
+static u64 get_hist_hit_count(struct trace_event_file *event_file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *data;
+ u64 ret = 0;
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = data->private_data;
+ ret += atomic64_read(&hist_data->map->hits);
+ }
+ }
+ return ret;
+}
+
static int hist_show(struct seq_file *m, void *v)
{
+ struct hist_file_data *hist_file = m->private;
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_data(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(hist_file->file);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
+ hist_file->last_read = get_hist_hit_count(event_file);
+ /*
+ * Update last_act too so that poll()/POLLPRI can wait for the next
+ * event after any syscall on hist file.
+ */
+ hist_file->last_act = hist_file->last_read;
+
+ return 0;
+}
+
+static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct trace_event_file *event_file;
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+ __poll_t ret = 0;
+ u64 cnt;
+
+ guard(mutex)(&event_mutex);
- out_unlock:
- mutex_unlock(&event_mutex);
+ event_file = event_file_data(file);
+ if (!event_file)
+ return EPOLLERR;
+
+ hist_poll_wait(file, wait);
+
+ cnt = get_hist_hit_count(event_file);
+ if (hist_file->last_read != cnt)
+ ret |= EPOLLIN | EPOLLRDNORM;
+ if (hist_file->last_act != cnt) {
+ hist_file->last_act = cnt;
+ ret |= EPOLLPRI;
+ }
return ret;
}
+static int event_hist_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+
+ kfree(hist_file);
+ return tracing_single_release_file_tr(inode, file);
+}
+
static int event_hist_open(struct inode *inode, struct file *file)
{
+ struct trace_event_file *event_file;
+ struct hist_file_data *hist_file;
int ret;
ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_data(file);
+ if (!event_file) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL);
+ if (!hist_file) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ hist_file->file = file;
+ hist_file->last_act = get_hist_hit_count(event_file);
+
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_show, file);
+ ret = single_open(file, hist_show, hist_file);
+ if (ret) {
+ kfree(hist_file);
+ goto err;
+ }
+
+ return 0;
+err:
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = tracing_single_release_file_tr,
+ .release = event_hist_release,
+ .poll = event_hist_poll,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5876,25 +6080,19 @@ static int hist_debug_show(struct seq_file *m, void *v)
{
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_data(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(m->private);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_debug_show(m, data, n++);
}
-
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
static int event_hist_debug_open(struct inode *inode, struct file *file)
@@ -5907,7 +6105,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_debug_show, file);
+ ret = single_open(file, hist_debug_show, file);
+ if (ret)
+ tracing_release_file_tr(inode, file);
+ return ret;
}
const struct file_operations event_hist_debug_fops = {
@@ -5927,6 +6128,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags & HIST_FIELD_FL_CPU)
seq_puts(m, "common_cpu");
+ if (hist_field->flags & HIST_FIELD_FL_COMM)
+ seq_puts(m, "common_comm");
else if (hist_field->flags & HIST_FIELD_FL_CONST)
seq_printf(m, "%llu", hist_field->constant);
else if (field_name) {
@@ -6073,6 +6276,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
+ if (alloc_hist_pad() < 0)
+ return -ENOMEM;
+
if (!data->ref && hist_data->attrs->name)
save_named_trigger(hist_data->attrs->name, data);
@@ -6117,9 +6323,10 @@ static void event_hist_trigger_free(struct event_trigger_data *data)
destroy_hist_data(hist_data);
}
+ free_hist_pad();
}
-static struct event_trigger_ops event_hist_trigger_ops = {
+static const struct event_trigger_ops event_hist_trigger_ops = {
.trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_init,
@@ -6132,9 +6339,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data)
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(data->named_data);
-
- return 0;
+ return event_hist_trigger_init(data->named_data);
}
static void event_hist_trigger_named_free(struct event_trigger_data *data)
@@ -6151,15 +6356,15 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data)
}
}
-static struct event_trigger_ops event_hist_trigger_named_ops = {
+static const struct event_trigger_ops event_hist_trigger_named_ops = {
.trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_named_init,
.free = event_hist_trigger_named_free,
};
-static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
- char *param)
+static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
+ char *param)
{
return &event_hist_trigger_ops;
}
@@ -6621,7 +6826,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
@@ -6652,27 +6857,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
if (existing_hist_update_only(glob, trigger_data, file))
goto out_free;
- ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
- if (ret < 0)
- goto out_free;
+ if (!get_named_trigger_data(trigger_data)) {
- if (get_named_trigger_data(trigger_data))
- goto enable;
+ ret = create_actions(hist_data);
+ if (ret)
+ goto out_free;
- ret = create_actions(hist_data);
- if (ret)
- goto out_unreg;
+ if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
+ ret = save_hist_vars(hist_data);
+ if (ret)
+ goto out_free;
+ }
- if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
- ret = save_hist_vars(hist_data);
+ ret = tracing_map_init(hist_data->map);
if (ret)
- goto out_unreg;
+ goto out_free;
}
- ret = tracing_map_init(hist_data->map);
- if (ret)
- goto out_unreg;
-enable:
+ ret = event_trigger_register(cmd_ops, file, glob, trigger_data);
+ if (ret < 0)
+ goto out_free;
+
ret = hist_trigger_enable(trigger_data, file);
if (ret)
goto out_unreg;
@@ -6754,38 +6959,38 @@ hist_enable_count_trigger(struct event_trigger_data *data,
hist_enable_trigger(data, buffer, rec, event);
}
-static struct event_trigger_ops hist_enable_trigger_ops = {
+static const struct event_trigger_ops hist_enable_trigger_ops = {
.trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops hist_enable_count_trigger_ops = {
+static const struct event_trigger_ops hist_enable_count_trigger_ops = {
.trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops hist_disable_trigger_ops = {
+static const struct event_trigger_ops hist_disable_trigger_ops = {
.trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops hist_disable_count_trigger_ops = {
+static const struct event_trigger_ops hist_disable_count_trigger_ops = {
.trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
hist_enable_get_trigger_ops(char *cmd, char *param)
{
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
bool enable;
enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bdaa9..a8f076809db4 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
strim(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (file) {
call = file->event_call;
size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index c82b401a294d..33cfbd4ed76d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -49,16 +49,11 @@ static char *last_cmd;
static int errpos(const char *str)
{
- int ret = 0;
-
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!str || !last_cmd)
- goto out;
+ return 0;
- ret = err_pos(last_cmd, str);
- out:
- mutex_unlock(&lastcmd_mutex);
- return ret;
+ return err_pos(last_cmd, str);
}
static void last_cmd_set(const char *str)
@@ -74,14 +69,12 @@ static void last_cmd_set(const char *str)
static void synth_err(u8 err_type, u16 err_pos)
{
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!last_cmd)
- goto out;
+ return;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
- out:
- mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
@@ -214,7 +207,7 @@ static int synth_field_string_size(char *type)
if (len == 0)
return 0; /* variable-length string */
- strncpy(buf, start, len);
+ memcpy(buf, start, len);
buf[len] = '\0';
err = kstrtouint(buf, 0, &size);
@@ -312,7 +305,7 @@ static const char *synth_field_fmt(char *type)
else if (strcmp(type, "gfp_t") == 0)
fmt = "%x";
else if (synth_field_is_string(type))
- fmt = "%.*s";
+ fmt = "%s";
else if (synth_field_is_stack(type))
fmt = "%s";
@@ -377,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
@@ -619,7 +611,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event,
fmt = synth_field_fmt(event->fields[i]->type);
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
event->fields[i]->name, fmt,
- i == event->n_fields - 1 ? "" : ", ");
+ i == event->n_fields - 1 ? "" : " ");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
@@ -859,6 +851,38 @@ static struct trace_event_fields synth_event_fields_array[] = {
{}
};
+static int synth_event_reg(struct trace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ struct synth_event *event = container_of(call, struct synth_event, call);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+#endif
+ case TRACE_REG_REGISTER:
+ if (!try_module_get(event->mod))
+ return -EBUSY;
+ break;
+ default:
+ break;
+ }
+
+ int ret = trace_event_reg(call, type, data);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_UNREGISTER:
+#endif
+ case TRACE_REG_UNREGISTER:
+ module_put(event->mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -888,7 +912,7 @@ static int register_synth_event(struct synth_event *event)
goto out;
}
call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
+ call->class->reg = synth_event_reg;
call->class->probe = trace_event_raw_event_synth;
call->data = event;
call->tp = event->tp;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4bec043c8690..cbfc306c0159 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
@@ -211,12 +211,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- if (unlikely(!event_file_data(file))) {
- mutex_unlock(&event_mutex);
+ if (unlikely(!event_file_file(file)))
return -ENODEV;
- }
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
@@ -239,8 +237,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
}
}
- mutex_unlock(&event_mutex);
-
return ret;
}
@@ -248,7 +244,6 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next;
struct event_command *p;
- int ret = -EINVAL;
next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
@@ -259,17 +254,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
}
command = (command[0] != '!') ? command : command + 1;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->parse(p, file, buff, command, next);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->parse(p, file, buff, command, next);
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t event_trigger_regex_write(struct file *file,
@@ -278,7 +270,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
{
struct trace_event_file *event_file;
ssize_t ret;
- char *buf;
+ char *buf __free(kfree) = NULL;
if (!cnt)
return 0;
@@ -292,24 +284,18 @@ static ssize_t event_trigger_regex_write(struct file *file,
strim(buf);
- mutex_lock(&event_mutex);
- event_file = event_file_data(file);
- if (unlikely(!event_file)) {
- mutex_unlock(&event_mutex);
- kfree(buf);
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_file(file);
+ if (unlikely(!event_file))
return -ENODEV;
- }
- ret = trigger_process_regex(event_file, buf);
- mutex_unlock(&event_mutex);
- kfree(buf);
+ ret = trigger_process_regex(event_file, buf);
if (ret < 0)
- goto out;
+ return ret;
*ppos += cnt;
- ret = cnt;
- out:
- return ret;
+ return cnt;
}
static int event_trigger_regex_release(struct inode *inode, struct file *file)
@@ -359,20 +345,16 @@ const struct file_operations event_trigger_fops = {
__init int register_event_command(struct event_command *cmd)
{
struct event_command *p;
- int ret = 0;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &trigger_commands);
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -382,20 +364,17 @@ __init int register_event_command(struct event_command *cmd)
__init int unregister_event_command(struct event_command *cmd)
{
struct event_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry_safe(p, n, &trigger_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -ENODEV;
}
/**
@@ -573,16 +552,14 @@ static int register_trigger(char *glob,
lockdep_assert_held(&event_mutex);
list_for_each_entry(test, &file->triggers, list) {
- if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
- ret = -EEXIST;
- goto out;
- }
+ if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type)
+ return -EEXIST;
}
if (data->ops->init) {
ret = data->ops->init(data);
if (ret < 0)
- goto out;
+ return ret;
}
list_add_rcu(&data->list, &file->triggers);
@@ -593,7 +570,6 @@ static int register_trigger(char *glob,
list_del_rcu(&data->list);
update_cond_flag(file);
}
-out:
return ret;
}
@@ -791,7 +767,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
if (!param_and_filter) {
if (param_required)
ret = -EINVAL;
- goto out;
+ return ret;
}
/*
@@ -802,7 +778,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
*/
if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) {
*filter = param_and_filter;
- goto out;
+ return ret;
}
/*
@@ -820,12 +796,11 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
if (!**filter)
*filter = NULL;
}
-out:
return ret;
}
/**
- * event_trigger_alloc - allocate and init event_trigger_data for a trigger
+ * trigger_data_alloc - allocate and init event_trigger_data for a trigger
* @cmd_ops: The event_command operations for the trigger
* @cmd: The cmd string
* @param: The param string
@@ -836,17 +811,17 @@ out:
* trigger_ops to assign to the event_trigger_data. @private_data can
* also be passed in and associated with the event_trigger_data.
*
- * Use event_trigger_free() to free an event_trigger_data object.
+ * Use trigger_data_free() to free an event_trigger_data object.
*
* Return: The trigger_data object success, NULL otherwise
*/
-struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data)
+struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data)
{
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
+ const struct event_trigger_ops *trigger_ops;
trigger_ops = cmd_ops->get_trigger_ops(cmd, param);
@@ -1010,15 +985,14 @@ event_trigger_parse(struct event_command *cmd_ops,
return ret;
ret = -ENOMEM;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file);
if (!trigger_data)
- goto out;
+ return ret;
if (remove) {
event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
- kfree(trigger_data);
- ret = 0;
- goto out;
+ trigger_data_free(trigger_data);
+ return 0;
}
ret = event_trigger_parse_num(param, trigger_data);
@@ -1038,13 +1012,12 @@ event_trigger_parse(struct event_command *cmd_ops,
/* Down the counter of trigger_data or free it if not used anymore */
event_trigger_free(trigger_data);
- out:
return ret;
out_free:
event_trigger_reset_filter(cmd_ops, trigger_data);
- kfree(trigger_data);
- goto out;
+ trigger_data_free(trigger_data);
+ return ret;
}
/**
@@ -1078,10 +1051,10 @@ int set_trigger_filter(char *filter_str,
s = strsep(&filter_str, " \t");
if (!strlen(s) || strcmp(s, "if") != 0)
- goto out;
+ return ret;
if (!filter_str)
- goto out;
+ return ret;
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->tr, file->event_call,
@@ -1125,7 +1098,6 @@ int set_trigger_filter(char *filter_str,
ret = -ENOMEM;
}
}
- out:
return ret;
}
@@ -1388,38 +1360,38 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops traceon_trigger_ops = {
+static const struct event_trigger_ops traceon_trigger_ops = {
.trigger = traceon_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops traceon_count_trigger_ops = {
+static const struct event_trigger_ops traceon_count_trigger_ops = {
.trigger = traceon_count_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops traceoff_trigger_ops = {
+static const struct event_trigger_ops traceoff_trigger_ops = {
.trigger = traceoff_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops traceoff_count_trigger_ops = {
+static const struct event_trigger_ops traceoff_count_trigger_ops = {
.trigger = traceoff_count_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
onoff_get_trigger_ops(char *cmd, char *param)
{
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
@@ -1512,21 +1484,21 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops snapshot_trigger_ops = {
+static const struct event_trigger_ops snapshot_trigger_ops = {
.trigger = snapshot_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops snapshot_count_trigger_ops = {
+static const struct event_trigger_ops snapshot_count_trigger_ops = {
.trigger = snapshot_count_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
snapshot_get_trigger_ops(char *cmd, char *param)
{
return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
@@ -1581,7 +1553,7 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_event_file *file = data->private_data;
if (file)
- __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP);
else
trace_dump_stack(STACK_SKIP);
}
@@ -1607,21 +1579,21 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops stacktrace_trigger_ops = {
+static const struct event_trigger_ops stacktrace_trigger_ops = {
.trigger = stacktrace_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops stacktrace_count_trigger_ops = {
+static const struct event_trigger_ops stacktrace_count_trigger_ops = {
.trigger = stacktrace_count_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
stacktrace_get_trigger_ops(char *cmd, char *param)
{
return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
@@ -1732,28 +1704,28 @@ void event_enable_trigger_free(struct event_trigger_data *data)
}
}
-static struct event_trigger_ops event_enable_trigger_ops = {
+static const struct event_trigger_ops event_enable_trigger_ops = {
.trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops event_enable_count_trigger_ops = {
+static const struct event_trigger_ops event_enable_count_trigger_ops = {
.trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops event_disable_trigger_ops = {
+static const struct event_trigger_ops event_disable_trigger_ops = {
.trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops event_disable_count_trigger_ops = {
+static const struct event_trigger_ops event_disable_count_trigger_ops = {
.trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
@@ -1793,7 +1765,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
ret = -EINVAL;
event_enable_file = find_event_file(tr, system, event);
if (!event_enable_file)
- goto out;
+ return ret;
#ifdef CONFIG_HIST_TRIGGERS
hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
@@ -1808,16 +1780,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
if (!enable_data)
- goto out;
+ return ret;
enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data);
if (!trigger_data) {
kfree(enable_data);
- goto out;
+ return ret;
}
if (remove) {
@@ -1825,7 +1797,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
kfree(trigger_data);
kfree(enable_data);
ret = 0;
- goto out;
+ return ret;
}
/* Up the trigger_data count to make sure nothing frees it on failure */
@@ -1855,7 +1827,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
goto out_disable;
event_trigger_free(trigger_data);
- out:
return ret;
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
@@ -1866,7 +1837,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
event_trigger_free(trigger_data);
kfree(enable_data);
- goto out;
+ return ret;
}
int event_enable_register_trigger(char *glob,
@@ -1886,15 +1857,14 @@ int event_enable_register_trigger(char *glob,
(test->cmd_ops->trigger_type ==
data->cmd_ops->trigger_type) &&
(test_enable_data->file == enable_data->file)) {
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
}
if (data->ops->init) {
ret = data->ops->init(data);
if (ret < 0)
- goto out;
+ return ret;
}
list_add_rcu(&data->list, &file->triggers);
@@ -1905,7 +1875,6 @@ int event_enable_register_trigger(char *glob,
list_del_rcu(&data->list);
update_cond_flag(file);
}
-out:
return ret;
}
@@ -1937,10 +1906,10 @@ void event_enable_unregister_trigger(char *glob,
data->ops->free(data);
}
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
event_enable_get_trigger_ops(char *cmd, char *param)
{
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
bool enable;
#ifdef CONFIG_HIST_TRIGGERS
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 70d428c394b6..af42aaa3d172 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
if (ret && ret != -ENOENT) {
struct user_event *user = enabler->event;
- pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n",
mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
}
@@ -1676,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user)
struct tracepoint *tp = &user->tracepoint;
char status = 0;
- if (atomic_read(&tp->key.enabled) > 0) {
+ if (static_key_enabled(&tp->key)) {
struct tracepoint_func *probe_func_ptr;
user_event_func_t probe_func;
@@ -1990,6 +1990,80 @@ static int user_event_set_tp_name(struct user_event *user)
}
/*
+ * Counts how many ';' without a trailing space are in the args.
+ */
+static int count_semis_no_space(char *args)
+{
+ int count = 0;
+
+ while ((args = strchr(args, ';'))) {
+ args++;
+
+ if (!isspace(*args))
+ count++;
+ }
+
+ return count;
+}
+
+/*
+ * Copies the arguments while ensuring all ';' have a trailing space.
+ */
+static char *insert_space_after_semis(char *args, int count)
+{
+ char *fixed, *pos;
+ int len;
+
+ len = strlen(args) + count;
+ fixed = kmalloc(len + 1, GFP_KERNEL);
+
+ if (!fixed)
+ return NULL;
+
+ pos = fixed;
+
+ /* Insert a space after ';' if there is no trailing space. */
+ while (*args) {
+ *pos = *args++;
+
+ if (*pos++ == ';' && !isspace(*args))
+ *pos++ = ' ';
+ }
+
+ *pos = '\0';
+
+ return fixed;
+}
+
+static char **user_event_argv_split(char *args, int *argc)
+{
+ char **split;
+ char *fixed;
+ int count;
+
+ /* Count how many ';' without a trailing space */
+ count = count_semis_no_space(args);
+
+ /* No fixup is required */
+ if (!count)
+ return argv_split(GFP_KERNEL, args, argc);
+
+ /* We must fixup 'field;field' to 'field; field' */
+ fixed = insert_space_after_semis(args, count);
+
+ if (!fixed)
+ return NULL;
+
+ /* We do a normal split afterwards */
+ split = argv_split(GFP_KERNEL, fixed, argc);
+
+ /* We can free since argv_split makes a copy */
+ kfree(fixed);
+
+ return split;
+}
+
+/*
* Parses the event name, arguments and flags then registers if successful.
* The name buffer lifetime is owned by this method for success cases only.
* Upon success the returned user_event has its ref count increased by 1.
@@ -2012,7 +2086,7 @@ static int user_event_parse(struct user_event_group *group, char *name,
return -EPERM;
if (args) {
- argv = argv_split(GFP_KERNEL, args, &argc);
+ argv = user_event_argv_split(args, &argc);
if (!argv)
return -ENOMEM;
@@ -2206,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
* It's possible key.enabled disables after this check, however
* we don't mind if a few events are included in this condition.
*/
- if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ if (likely(static_key_enabled(&tp->key))) {
struct tracepoint_func *probe_func_ptr;
user_event_func_t probe_func;
struct iov_iter copy;
@@ -2719,11 +2793,8 @@ static int user_seq_show(struct seq_file *m, void *p)
seq_printf(m, "%s", EVENT_TP_NAME(user));
- if (status != 0)
- seq_puts(m, " #");
-
if (status != 0) {
- seq_puts(m, " Used by");
+ seq_puts(m, " # Used by");
if (status & EVENT_STATUS_FTRACE)
seq_puts(m, " ftrace");
if (status & EVENT_STATUS_PERF)
@@ -2811,7 +2882,7 @@ err:
return -ENODEV;
}
-static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+static int set_max_user_events_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2825,7 +2896,7 @@ static int set_max_user_events_sysctl(struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table user_event_sysctls[] = {
+static const struct ctl_table user_event_sysctls[] = {
{
.procname = "user_events_max",
.data = &max_user_events,
@@ -2833,7 +2904,6 @@ static struct ctl_table user_event_sysctls[] = {
.mode = 0644,
.proc_handler = set_max_user_events_sysctl,
},
- {}
};
static int __init trace_events_user_init(void)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 4f4280815522..b40fa59159ac 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -21,6 +21,7 @@
#define FPROBE_EVENT_SYSTEM "fprobes"
#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
#define RETHOOK_MAXACTIVE_MAX 4096
+#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
static int trace_fprobe_create(const char *raw_command);
static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -133,7 +134,7 @@ static int
process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
void *dest, void *base)
{
- struct pt_regs *regs = rec;
+ struct ftrace_regs *fregs = rec;
unsigned long val;
int ret;
@@ -141,17 +142,17 @@ retry:
/* 1st stage: get value from context */
switch (code->op) {
case FETCH_OP_STACK:
- val = regs_get_kernel_stack_nth(regs, code->param);
+ val = ftrace_regs_get_kernel_stack_nth(fregs, code->param);
break;
case FETCH_OP_STACKP:
- val = kernel_stack_pointer(regs);
+ val = ftrace_regs_get_stack_pointer(fregs);
break;
case FETCH_OP_RETVAL:
- val = regs_return_value(regs);
+ val = ftrace_regs_get_return_value(fregs);
break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
- val = regs_get_kernel_argument(regs, code->param);
+ val = ftrace_regs_get_argument(fregs, code->param);
break;
case FETCH_OP_EDATA:
val = *(unsigned long *)((unsigned long)edata + code->offset);
@@ -174,7 +175,7 @@ NOKPROBE_SYMBOL(process_fetch_insn)
/* function entry handler */
static nokprobe_inline void
__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs,
+ struct ftrace_regs *fregs,
struct trace_event_file *trace_file)
{
struct fentry_trace_entry_head *entry;
@@ -188,41 +189,71 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs, NULL);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = entry_ip;
- store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fentry_trace_func(tf, entry_ip, regs, link->file);
+ __fentry_trace_func(tf, entry_ip, fregs, link->file);
}
NOKPROBE_SYMBOL(fentry_trace_func);
+static nokprobe_inline
+void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val = 0;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = ftrace_regs_get_argument(fregs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+
/* function exit handler */
static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (tf->tp.entry_arg)
- store_trace_entry_data(entry_data, &tf->tp, regs);
+ store_fprobe_entry_data(entry_data, &tf->tp, fregs);
return 0;
}
@@ -230,7 +261,7 @@ NOKPROBE_SYMBOL(trace_fprobe_entry_handler)
static nokprobe_inline void
__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data, struct trace_event_file *trace_file)
{
struct fexit_trace_entry_head *entry;
@@ -244,60 +275,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs, entry_data);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs, void *entry_data)
+ unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file);
+ __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file);
}
NOKPROBE_SYMBOL(fexit_trace_func);
#ifdef CONFIG_PERF_EVENTS
static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fentry_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tf->tp, regs, NULL);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return 0;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->ip = entry_ip;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -306,31 +340,34 @@ NOKPROBE_SYMBOL(fentry_perf_func);
static void
fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fexit_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tf->tp, regs, entry_data);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -338,33 +375,34 @@ NOKPROBE_SYMBOL(fexit_perf_func);
#endif /* CONFIG_PERF_EVENTS */
static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
int ret = 0;
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fentry_trace_func(tf, entry_ip, regs);
+ fentry_trace_func(tf, entry_ip, fregs);
+
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- ret = fentry_perf_func(tf, entry_ip, regs);
+ ret = fentry_perf_func(tf, entry_ip, fregs);
#endif
return ret;
}
NOKPROBE_SYMBOL(fentry_dispatcher);
static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data);
+ fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data);
+ fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data);
#endif
}
NOKPROBE_SYMBOL(fexit_dispatcher);
@@ -378,6 +416,9 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
}
}
+/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */
+DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including fprobe).
*/
@@ -385,10 +426,10 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *event,
const char *symbol,
struct tracepoint *tpoint,
- int maxactive,
+ struct module *mod,
int nargs, bool is_return)
{
- struct trace_fprobe *tf;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
int ret = -ENOMEM;
tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL);
@@ -397,7 +438,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tf->symbol)
- goto error;
+ return ERR_PTR(-ENOMEM);
if (is_return)
tf->fp.exit_handler = fexit_dispatcher;
@@ -405,17 +446,14 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->fp.entry_handler = fentry_dispatcher;
tf->tpoint = tpoint;
- tf->fp.nr_maxactive = maxactive;
+ tf->mod = mod;
ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tf->devent, &trace_fprobe_ops);
- return tf;
-error:
- free_trace_fprobe(tf);
- return ERR_PTR(ret);
+ return_ptr(tf);
}
static struct trace_fprobe *find_trace_fprobe(const char *event,
@@ -672,6 +710,24 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
return trace_probe_unregister_event_call(&tf->tp);
}
+static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
+{
+ struct tracepoint *tpoint = tf->tpoint;
+ unsigned long ip = (unsigned long)tpoint->probestub;
+ int ret;
+
+ /*
+ * Here, we do 2 steps to enable fprobe on a tracepoint.
+ * At first, put __probestub_##TP function on the tracepoint
+ * and put a fprobe on the stub function.
+ */
+ ret = tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+ if (ret < 0)
+ return ret;
+ return register_fprobe_ips(&tf->fp, &ip, 1);
+}
+
/* Internal register function - just handle fprobe and flags */
static int __register_trace_fprobe(struct trace_fprobe *tf)
{
@@ -698,18 +754,12 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
tf->fp.flags |= FPROBE_FL_DISABLED;
if (trace_fprobe_is_tracepoint(tf)) {
- struct tracepoint *tpoint = tf->tpoint;
- unsigned long ip = (unsigned long)tpoint->probestub;
- /*
- * Here, we do 2 steps to enable fprobe on a tracepoint.
- * At first, put __probestub_##TP function on the tracepoint
- * and put a fprobe on the stub function.
- */
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
- if (ret < 0)
- return ret;
- return register_fprobe_ips(&tf->fp, &ip, 1);
+
+ /* This tracepoint is not loaded yet */
+ if (tf->tpoint == TRACEPOINT_STUB)
+ return 0;
+
+ return __regsiter_tracepoint_fprobe(tf);
}
/* TODO: handle filter, nofilter or symbol list */
@@ -830,14 +880,12 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
struct trace_fprobe *old_tf;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
trace_probe_group_name(&tf->tp));
- if (old_tf) {
- ret = append_trace_fprobe(tf, old_tf);
- goto end;
- }
+ if (old_tf)
+ return append_trace_fprobe(tf, old_tf);
/* Register new event */
ret = register_fprobe_event(tf);
@@ -847,7 +895,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register fprobe */
@@ -857,46 +905,32 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
else
dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
-end:
- mutex_unlock(&event_mutex);
return ret;
}
-#ifdef CONFIG_MODULES
-static int __tracepoint_probe_module_cb(struct notifier_block *self,
- unsigned long val, void *data)
+struct __find_tracepoint_cb_data {
+ const char *tp_name;
+ struct tracepoint *tpoint;
+ struct module *mod;
+};
+
+static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mod, void *priv)
{
- struct tp_module *tp_mod = data;
- struct trace_fprobe *tf;
- struct dyn_event *pos;
+ struct __find_tracepoint_cb_data *data = priv;
- if (val != MODULE_STATE_GOING)
- return NOTIFY_DONE;
+ if (!data->tpoint && !strcmp(data->tp_name, tp->name)) {
+ /* If module is not specified, try getting module refcount. */
+ if (!data->mod && mod) {
+ /* If failed to get refcount, ignore this tracepoint. */
+ if (!try_module_get(mod))
+ return;
- mutex_lock(&event_mutex);
- for_each_trace_fprobe(tf, pos) {
- if (tp_mod->mod == tf->mod) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
+ data->mod = mod;
}
+ data->tpoint = tp;
}
- mutex_unlock(&event_mutex);
-
- return NOTIFY_DONE;
}
-static struct notifier_block tracepoint_module_nb = {
- .notifier_call = __tracepoint_probe_module_cb,
-};
-#endif /* CONFIG_MODULES */
-
-struct __find_tracepoint_cb_data {
- const char *tp_name;
- struct tracepoint *tpoint;
-};
-
static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
{
struct __find_tracepoint_cb_data *data = priv;
@@ -905,17 +939,97 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
data->tpoint = tp;
}
-static struct tracepoint *find_tracepoint(const char *tp_name)
+/*
+ * Find a tracepoint from kernel and module. If the tracepoint is on the module,
+ * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is
+ * not NULL, caller must call module_put(*tp_mod) after used the tracepoint.
+ */
+static struct tracepoint *find_tracepoint(const char *tp_name,
+ struct module **tp_mod)
{
struct __find_tracepoint_cb_data data = {
.tp_name = tp_name,
+ .mod = NULL,
};
for_each_kernel_tracepoint(__find_tracepoint_cb, &data);
+ if (!data.tpoint && IS_ENABLED(CONFIG_MODULES)) {
+ for_each_module_tracepoint(__find_tracepoint_module_cb, &data);
+ *tp_mod = data.mod;
+ }
+
+ return data.tpoint;
+}
+
+#ifdef CONFIG_MODULES
+static void reenable_trace_fprobe(struct trace_fprobe *tf)
+{
+ struct trace_probe *tp = &tf->tp;
+
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ __enable_trace_fprobe(tf);
+ }
+}
+
+/*
+ * Find a tracepoint from specified module. In this case, this does not get the
+ * module's refcount. The caller must ensure the module is not freed.
+ */
+static struct tracepoint *find_tracepoint_in_module(struct module *mod,
+ const char *tp_name)
+{
+ struct __find_tracepoint_cb_data data = {
+ .tp_name = tp_name,
+ .mod = mod,
+ };
+
+ for_each_tracepoint_in_module(mod, __find_tracepoint_module_cb, &data);
return data.tpoint;
}
+static int __tracepoint_probe_module_cb(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct tp_module *tp_mod = data;
+ struct tracepoint *tpoint;
+ struct trace_fprobe *tf;
+ struct dyn_event *pos;
+
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
+ return NOTIFY_DONE;
+
+ mutex_lock(&event_mutex);
+ for_each_trace_fprobe(tf, pos) {
+ if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
+ tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
+ if (tpoint) {
+ tf->tpoint = tpoint;
+ tf->mod = tp_mod->mod;
+ if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
+ trace_probe_is_enabled(&tf->tp))
+ reenable_trace_fprobe(tf);
+ }
+ } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
+ unregister_fprobe(&tf->fp);
+ if (trace_fprobe_is_tracepoint(tf)) {
+ tracepoint_probe_unregister(tf->tpoint,
+ tf->tpoint->probestub, NULL);
+ tf->tpoint = TRACEPOINT_STUB;
+ tf->mod = NULL;
+ }
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = __tracepoint_probe_module_cb,
+};
+#endif /* CONFIG_MODULES */
+
static int parse_symbol_and_return(int argc, const char *argv[],
char **symbol, bool *is_return,
bool is_tracepoint)
@@ -941,6 +1055,19 @@ static int parse_symbol_and_return(int argc, const char *argv[],
if (*is_return)
return 0;
+ if (is_tracepoint) {
+ tmp = *symbol;
+ while (*tmp && (isalnum(*tmp) || *tmp == '_'))
+ tmp++;
+ if (*tmp) {
+ /* find a wrong character. */
+ trace_probe_log_err(tmp - *symbol, BAD_TP_NAME);
+ kfree(*symbol);
+ *symbol = NULL;
+ return -EINVAL;
+ }
+ }
+
/* If there is $retval, this should be a return fprobe. */
for (i = 2; i < argc; i++) {
tmp = strstr(argv[i], "$retval");
@@ -948,6 +1075,8 @@ static int parse_symbol_and_return(int argc, const char *argv[],
if (is_tracepoint) {
trace_probe_log_set_index(i);
trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE);
+ kfree(*symbol);
+ *symbol = NULL;
return -EINVAL;
}
*is_return = true;
@@ -957,7 +1086,10 @@ static int parse_symbol_and_return(int argc, const char *argv[],
return 0;
}
-static int __trace_fprobe_create(int argc, const char *argv[])
+DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
+
+static int trace_fprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -983,22 +1115,20 @@ static int __trace_fprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_fprobe *tf = NULL;
- int i, len, new_argc = 0, ret = 0;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
+ int i, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL;
+ char *symbol __free(kfree) = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
- const char **new_argv = NULL;
- int maxactive = 0;
+ const char **new_argv __free(kfree) = NULL;
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char sbuf[KSYM_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
+ char *dbuf __free(kfree) = NULL;
bool is_tracepoint = false;
+ struct module *tp_mod __free(module_put) = NULL;
struct tracepoint *tpoint = NULL;
- struct traceprobe_parse_context ctx = {
- .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
- };
if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
return -ECANCELED;
@@ -1008,35 +1138,13 @@ static int __trace_fprobe_create(int argc, const char *argv[])
group = TRACEPOINT_EVENT_SYSTEM;
}
- trace_probe_log_init("trace_fprobe", argc, argv);
-
- event = strchr(&argv[0][1], ':');
- if (event)
- event++;
-
- if (isdigit(argv[0][1])) {
- if (event)
- len = event - &argv[0][1] - 1;
- else
- len = strlen(&argv[0][1]);
- if (len > MAX_EVENT_NAME_LEN - 1) {
- trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- memcpy(buf, &argv[0][1], len);
- buf[len] = '\0';
- ret = kstrtouint(buf, 0, &maxactive);
- if (ret || !maxactive) {
+ if (argv[0][1] != '\0') {
+ if (argv[0][1] != ':') {
+ trace_probe_log_set_index(0);
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- /* fprobe rethook instances are iterated over via a list. The
- * maximum should stay reasonable.
- */
- if (maxactive > RETHOOK_MAXACTIVE_MAX) {
- trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
+ event = &argv[0][2];
}
trace_probe_log_set_index(1);
@@ -1044,20 +1152,14 @@ static int __trace_fprobe_create(int argc, const char *argv[])
/* a symbol(or tracepoint) must be specified */
ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
if (ret < 0)
- goto parse_error;
-
- if (!is_return && maxactive) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
- }
+ return -EINVAL;
trace_probe_log_set_index(0);
if (event) {
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return -EINVAL;
}
if (!event) {
@@ -1073,69 +1175,81 @@ static int __trace_fprobe_create(int argc, const char *argv[])
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
else
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
if (is_tracepoint) {
- ctx.flags |= TPARG_FL_TPOINT;
- tpoint = find_tracepoint(symbol);
- if (!tpoint) {
+ ctx->flags |= TPARG_FL_TPOINT;
+ tpoint = find_tracepoint(symbol, &tp_mod);
+ if (tpoint) {
+ ctx->funcname = kallsyms_lookup(
+ (unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
+ } else if (IS_ENABLED(CONFIG_MODULES)) {
+ /* This *may* be loaded afterwards */
+ tpoint = TRACEPOINT_STUB;
+ ctx->funcname = symbol;
+ } else {
trace_probe_log_set_index(1);
trace_probe_log_err(0, NO_TRACEPOINT);
- goto parse_error;
+ return -EINVAL;
}
- ctx.funcname = kallsyms_lookup(
- (unsigned long)tpoint->probestub,
- NULL, NULL, NULL, sbuf);
} else
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
argc -= 2; argv += 2;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
- if (IS_ERR(new_argv)) {
- ret = PTR_ERR(new_argv);
- new_argv = NULL;
- goto out;
- }
+ abuf, MAX_BTF_ARGS_LEN, ctx);
+ if (IS_ERR(new_argv))
+ return PTR_ERR(new_argv);
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ return -E2BIG;
+ }
+
+ ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
+ if (ret)
+ return ret;
/* setup a probe */
- tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
+ tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
argc, is_return);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tf is not allocated */
+ return ret;
}
- if (is_tracepoint)
- tf->mod = __module_text_address(
- (unsigned long)tf->tpoint->probestub);
-
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
}
if (is_return && tf->tp.entry_arg) {
tf->fp.entry_handler = trace_fprobe_entry_handler;
tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp);
+ if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_EARGS);
+ return -E2BIG;
+ }
}
ret = traceprobe_set_print_fmt(&tf->tp,
is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_fprobe(tf);
if (ret) {
@@ -1146,26 +1260,32 @@ static int __trace_fprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return -EINVAL;
}
-out:
+ /* 'tf' is successfully registered. To avoid freeing, assign NULL. */
+ tf = NULL;
+
+ return 0;
+}
+
+static int trace_fprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context ctx = {
+ .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
+ };
+ int ret;
+
+ trace_probe_log_init("trace_fprobe", argc, argv);
+ ret = trace_fprobe_create_internal(argc, argv, &ctx);
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_fprobe(tf);
- goto out;
}
static int trace_fprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_fprobe_create);
+ return trace_probe_create(raw_command, trace_fprobe_create_cb);
}
static int trace_fprobe_release(struct dyn_event *ev)
@@ -1187,8 +1307,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
seq_putc(m, 't');
else
seq_putc(m, 'f');
- if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive)
- seq_printf(m, "%d", tf->fp.nr_maxactive);
seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp),
trace_probe_name(&tf->tp));
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..d17c18934445 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -25,6 +25,9 @@ static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
+function_args_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
@@ -42,9 +45,10 @@ enum {
TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */
TRACE_FUNC_OPT_STACK = 0x1,
TRACE_FUNC_OPT_NO_REPEATS = 0x2,
+ TRACE_FUNC_OPT_ARGS = 0x4,
/* Update this to next highest bit. */
- TRACE_FUNC_OPT_HIGHEST_BIT = 0x4
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x8
};
#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
@@ -80,6 +84,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr)
int ftrace_create_function_files(struct trace_array *tr,
struct dentry *parent)
{
+ int ret;
/*
* The top level array uses the "global_ops", and the files are
* created on boot up.
@@ -90,6 +95,12 @@ int ftrace_create_function_files(struct trace_array *tr,
if (!tr->ops)
return -EINVAL;
+ ret = allocate_fgraph_ops(tr, tr->ops);
+ if (ret) {
+ kfree(tr->ops);
+ return ret;
+ }
+
ftrace_create_filter_files(tr->ops, parent);
return 0;
@@ -99,6 +110,7 @@ void ftrace_destroy_function_files(struct trace_array *tr)
{
ftrace_destroy_filter_files(tr->ops);
ftrace_free_ftrace_ops(tr);
+ free_fgraph_ops(tr);
}
static ftrace_func_t select_trace_function(u32 flags_val)
@@ -106,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val)
switch (flags_val & TRACE_FUNC_OPT_MASK) {
case TRACE_FUNC_NO_OPTS:
return function_trace_call;
+ case TRACE_FUNC_OPT_ARGS:
+ return function_args_trace_call;
case TRACE_FUNC_OPT_STACK:
return function_stack_trace_call;
case TRACE_FUNC_OPT_NO_REPEATS:
@@ -168,15 +182,59 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(&tr->array_buffer);
}
+/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
+static __always_inline unsigned long
+function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
+{
+ unsigned long true_parent_ip;
+ int idx = 0;
+
+ true_parent_ip = parent_ip;
+ if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs)
+ true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip,
+ (unsigned long *)ftrace_regs_get_stack_pointer(fregs));
+ return true_parent_ip;
+}
+#else
+static __always_inline unsigned long
+function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
+{
+ return parent_ip;
+}
+#endif
+
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
- int cpu;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
+
+ trace_ctx = tracing_gen_ctx_dec();
+
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+
+ ftrace_test_recursion_unlock(bit);
+}
+
+static void
+function_args_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct trace_array *tr = op->private;
+ unsigned int trace_ctx;
+ int bit;
if (unlikely(!tr->function_enabled))
return;
@@ -187,10 +245,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx();
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (!atomic_read(&data->disabled))
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
ftrace_test_recursion_unlock(bit);
}
@@ -223,6 +278,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
long disabled;
int cpu;
unsigned int trace_ctx;
+ int skip = STACK_SKIP;
if (unlikely(!tr->function_enabled))
return;
@@ -232,17 +288,22 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
* recursive protection is performed.
*/
local_irq_save(flags);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
- __trace_stack(tr, trace_ctx, STACK_SKIP);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+#ifdef CONFIG_UNWINDER_FRAME_POINTER
+ if (ftrace_pids_enabled(op))
+ skip++;
+#endif
+ __trace_stack(tr, trace_ctx, skip);
}
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
local_irq_restore(flags);
}
@@ -283,11 +344,8 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
{
struct trace_func_repeats *last_info;
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
- unsigned long flags;
int bit;
- int cpu;
if (unlikely(!tr->function_enabled))
return;
@@ -296,9 +354,8 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
if (bit < 0)
return;
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (atomic_read(&data->disabled))
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
+ if (!tracer_tracing_is_on(tr))
goto out;
/*
@@ -308,15 +365,14 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
* TODO: think about a solution that is better than just hoping to be
* lucky.
*/
- last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
+ last_info = this_cpu_ptr(tr->last_func_repeats);
if (is_repeat_check(tr, last_info, ip, parent_ip))
goto out;
- local_save_flags(flags);
- trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
out:
ftrace_test_recursion_unlock(bit);
@@ -343,9 +399,10 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
* recursive protection is performed.
*/
local_irq_save(flags);
+ parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
if (likely(disabled == 1)) {
last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
@@ -355,12 +412,12 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_flags(flags);
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
__trace_stack(tr, trace_ctx, STACK_SKIP);
}
out:
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
local_irq_restore(flags);
}
@@ -369,6 +426,9 @@ static struct tracer_opt func_opts[] = {
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
{ TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+ { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) },
+#endif
{ } /* Always set a last empty entry */
};
@@ -563,11 +623,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
static __always_inline void trace_stack(struct trace_array *tr)
{
- unsigned int trace_ctx;
-
- trace_ctx = tracing_gen_ctx();
-
- __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP);
+ __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c35fbaab2a47..9234e2c39abf 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -31,7 +31,10 @@ struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
- struct ftrace_graph_ent_entry ent;
+ union {
+ struct ftrace_graph_ent_entry ent;
+ struct fgraph_retaddr_ent_entry rent;
+ } ent;
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -64,6 +67,14 @@ static struct tracer_opt trace_opts[] = {
/* Display function return value in hexadecimal format ? */
{ TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) },
#endif
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ /* Display function return address ? */
+ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
+#endif
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+ /* Display function arguments ? */
+ { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -83,7 +94,10 @@ static struct tracer_flags tracer_flags = {
.opts = trace_opts
};
-static struct trace_array *graph_array;
+static bool tracer_flags_is_set(u32 flags)
+{
+ return (tracer_flags.val & flags) == flags;
+}
/*
* DURATION column is being also used to display IRQ signs,
@@ -100,26 +114,74 @@ static void
print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags);
+static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx, struct ftrace_regs *fregs)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer = tr->array_buffer.buffer;
+ struct ftrace_graph_ent_entry *entry;
+ int size;
+
+ /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */
+ size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx);
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
+ trace_buffer_unlock_commit_nostack(buffer, event);
+
+ return 1;
+}
+
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_funcgraph_entry;
+ return __graph_entry(tr, trace, trace_ctx, NULL);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr)
+{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
- struct ftrace_graph_ent_entry *entry;
+ struct fgraph_retaddr_ent_entry *entry;
- event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT,
sizeof(*entry), trace_ctx);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
- entry->graph_ent = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ entry->graph_ent.func = trace->func;
+ entry->graph_ent.depth = trace->depth;
+ entry->graph_ent.retaddr = retaddr;
+ trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
+#else
+int __trace_graph_retaddr_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx,
+ unsigned long retaddr)
+{
+ return 1;
+}
+#endif
static inline int ftrace_graph_ignore_irqs(void)
{
@@ -129,17 +191,22 @@ static inline int ftrace_graph_ignore_irqs(void)
return in_hardirq();
}
-int trace_graph_entry(struct ftrace_graph_ent *trace)
+struct fgraph_times {
+ unsigned long long calltime;
+ unsigned long long sleeptime; /* may be optional! */
+};
+
+static int graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct trace_array *tr = graph_array;
- struct trace_array_cpu *data;
- unsigned long flags;
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
+ struct fgraph_times *ftimes;
unsigned int trace_ctx;
- long disabled;
- int ret;
- int cpu;
+ int ret = 0;
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+ if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
/*
@@ -150,7 +217,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+ *task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
@@ -161,12 +228,25 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (!ftrace_trace_task(tr))
return 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
if (ftrace_graph_ignore_irqs())
return 0;
+ if (fgraph_sleep_time) {
+ /* Only need to record the calltime */
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime));
+ } else {
+ ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes));
+ if (ftimes)
+ ftimes->sleeptime = current->ftrace_sleeptime;
+ }
+ if (!ftimes)
+ return 0;
+
+ ftimes->calltime = trace_clock_local();
+
/*
* Stop here if tracing_threshold is set. We only write function return
* events to the ring buffer.
@@ -174,23 +254,32 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
if (tracing_thresh)
return 1;
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- ret = __trace_graph_entry(tr, trace, trace_ctx);
+ trace_ctx = tracing_gen_ctx();
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+ tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) {
+ unsigned long retaddr = ftrace_graph_top_ret_addr(current);
+ ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
} else {
- ret = 0;
+ ret = __graph_entry(tr, trace, trace_ctx, fregs);
}
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-
return ret;
}
+int trace_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return graph_entry(trace, gops, NULL);
+}
+
+static int trace_graph_entry_args(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return graph_entry(trace, gops, fregs);
+}
+
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned int trace_ctx)
@@ -203,12 +292,10 @@ __trace_graph_function(struct trace_array *tr,
struct ftrace_graph_ret ret = {
.func = ip,
.depth = 0,
- .calltime = time,
- .rettime = time,
};
__trace_graph_entry(tr, &ent, trace_ctx);
- __trace_graph_return(tr, &ret, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx, time, time);
}
void
@@ -220,10 +307,10 @@ trace_graph_function(struct trace_array *tr,
}
void __trace_graph_return(struct trace_array *tr,
- struct ftrace_graph_ret *trace,
- unsigned int trace_ctx)
+ struct ftrace_graph_ret *trace,
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime)
{
- struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
@@ -234,82 +321,133 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ entry->calltime = calltime;
+ entry->rettime = rettime;
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
-void trace_graph_return(struct ftrace_graph_ret *trace)
+static void handle_nosleeptime(struct ftrace_graph_ret *trace,
+ struct fgraph_times *ftimes,
+ int size)
{
- struct trace_array *tr = graph_array;
- struct trace_array_cpu *data;
- unsigned long flags;
+ if (fgraph_sleep_time || size < sizeof(*ftimes))
+ return;
+
+ ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime;
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops, struct ftrace_regs *fregs)
+{
+ unsigned long *task_var = fgraph_get_task_var(gops);
+ struct trace_array *tr = gops->private;
+ struct fgraph_times *ftimes;
unsigned int trace_ctx;
- long disabled;
- int cpu;
+ u64 calltime, rettime;
+ int size;
- ftrace_graph_addr_finish(trace);
+ rettime = trace_clock_local();
- if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
- trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+ ftrace_graph_addr_finish(gops, trace);
+
+ if (*task_var & TRACE_GRAPH_NOTRACE) {
+ *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
- }
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
+ ftimes = fgraph_retrieve_data(gops->idx, &size);
+ if (!ftimes)
+ return;
-void set_graph_array(struct trace_array *tr)
-{
- graph_array = tr;
+ handle_nosleeptime(trace, ftimes, size);
- /* Make graph_array visible before we start tracing */
+ calltime = ftimes->calltime;
- smp_mb();
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
-static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- ftrace_graph_addr_finish(trace);
+ struct fgraph_times *ftimes;
+ int size;
+
+ ftrace_graph_addr_finish(gops, trace);
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
return;
}
+ ftimes = fgraph_retrieve_data(gops->idx, &size);
+ if (!ftimes)
+ return;
+
+ handle_nosleeptime(trace, ftimes, size);
+
if (tracing_thresh &&
- (trace->rettime - trace->calltime < tracing_thresh))
+ (trace_clock_local() - ftimes->calltime < tracing_thresh))
return;
else
- trace_graph_return(trace);
+ trace_graph_return(trace, gops, fregs);
}
-static struct fgraph_ops funcgraph_thresh_ops = {
- .entryfunc = &trace_graph_entry,
- .retfunc = &trace_graph_thresh_return,
-};
-
static struct fgraph_ops funcgraph_ops = {
.entryfunc = &trace_graph_entry,
.retfunc = &trace_graph_return,
};
+int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ struct fgraph_ops *gops;
+
+ gops = kzalloc(sizeof(*gops), GFP_KERNEL);
+ if (!gops)
+ return -ENOMEM;
+
+ gops->entryfunc = &trace_graph_entry;
+ gops->retfunc = &trace_graph_return;
+
+ tr->gops = gops;
+ gops->private = tr;
+
+ fgraph_init_ops(&gops->ops, ops);
+
+ return 0;
+}
+
+void free_fgraph_ops(struct trace_array *tr)
+{
+ kfree(tr->gops);
+}
+
+__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops)
+{
+ tr->gops = &funcgraph_ops;
+ funcgraph_ops.private = tr;
+ fgraph_init_ops(&tr->gops->ops, ops);
+}
+
static int graph_trace_init(struct trace_array *tr)
{
int ret;
- set_graph_array(tr);
+ if (tracer_flags_is_set(TRACE_GRAPH_ARGS))
+ tr->gops->entryfunc = trace_graph_entry_args;
+ else
+ tr->gops->entryfunc = trace_graph_entry;
+
if (tracing_thresh)
- ret = register_ftrace_graph(&funcgraph_thresh_ops);
+ tr->gops->retfunc = trace_graph_thresh_return;
else
- ret = register_ftrace_graph(&funcgraph_ops);
+ tr->gops->retfunc = trace_graph_return;
+
+ /* Make gops functions visible before we start tracing */
+ smp_mb();
+
+ ret = register_ftrace_graph(tr->gops);
if (ret)
return ret;
tracing_start_cmdline_record();
@@ -317,13 +455,32 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+static int ftrace_graph_trace_args(struct trace_array *tr, int set)
+{
+ trace_func_graph_ent_t entry;
+
+ if (set)
+ entry = trace_graph_entry_args;
+ else
+ entry = trace_graph_entry;
+
+ /* See if there's any changes */
+ if (tr->gops->entryfunc == entry)
+ return 0;
+
+ unregister_ftrace_graph(tr->gops);
+
+ tr->gops->entryfunc = entry;
+
+ /* Make gops functions visible before we start tracing */
+ smp_mb();
+ return register_ftrace_graph(tr->gops);
+}
+
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
- if (tracing_thresh)
- unregister_ftrace_graph(&funcgraph_thresh_ops);
- else
- unregister_ftrace_graph(&funcgraph_ops);
+ unregister_ftrace_graph(tr->gops);
}
static int graph_trace_update_thresh(struct trace_array *tr)
@@ -434,7 +591,7 @@ get_return_for_leaf(struct trace_iterator *iter,
* then we just reuse the data from before.
*/
if (data && data->failed) {
- curr = &data->ent;
+ curr = &data->ent.ent;
next = &data->ret;
} else {
@@ -464,7 +621,10 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- data->ent = *curr;
+ if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
+ data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
+ else
+ data->ent.ent = *curr;
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
@@ -521,6 +681,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
+ addr += iter->tr->text_delta;
+
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return;
@@ -626,52 +788,102 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
}
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
-
#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL
+#else
+#define __TRACE_GRAPH_PRINT_RETVAL 0
+#endif
-static void print_graph_retval(struct trace_seq *s, unsigned long retval,
- bool leaf, void *func, bool hex_format)
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR
+static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry,
+ u32 trace_flags, bool comment)
+{
+ if (comment)
+ trace_seq_puts(s, " /*");
+
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET);
+
+ if (comment)
+ trace_seq_puts(s, " */");
+}
+#else
+#define __TRACE_GRAPH_PRINT_RETADDR 0
+#define print_graph_retaddr(_seq, _entry, _tflags, _comment) do { } while (0)
+#endif
+
+#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR)
+
+static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
+ struct ftrace_graph_ret *graph_ret, void *func,
+ u32 opt_flags, u32 trace_flags, int args_size)
{
unsigned long err_code = 0;
+ unsigned long retval = 0;
+ bool print_retaddr = false;
+ bool print_retval = false;
+ bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX);
- if (retval == 0 || hex_format)
- goto done;
+#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
+ retval = graph_ret->retval;
+ print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL);
+#endif
- /* Check if the return value matches the negative format */
- if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
- (((u64)retval) >> 32) == 0) {
- /* sign extension */
- err_code = (unsigned long)(s32)retval;
- } else {
- err_code = retval;
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR);
+#endif
+
+ if (print_retval && retval && !hex_format) {
+ /* Check if the return value matches the negative format */
+ if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) &&
+ (((u64)retval) >> 32) == 0) {
+ err_code = sign_extend64(retval, 31);
+ } else {
+ err_code = retval;
+ }
+
+ if (!IS_ERR_VALUE(err_code))
+ err_code = 0;
}
- if (!IS_ERR_VALUE(err_code))
- err_code = 0;
+ if (entry) {
+ if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
+ print_retaddr = false;
-done:
- if (leaf) {
- if (hex_format || (err_code == 0))
- trace_seq_printf(s, "%ps(); /* = 0x%lx */\n",
- func, retval);
- else
- trace_seq_printf(s, "%ps(); /* = %ld */\n",
- func, err_code);
+ trace_seq_printf(s, "%ps", func);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
+ print_function_args(s, entry->args, (unsigned long)func);
+ trace_seq_putc(s, ';');
+ } else
+ trace_seq_puts(s, "();");
+
+ if (print_retval || print_retaddr)
+ trace_seq_puts(s, " /*");
} else {
+ print_retaddr = false;
+ trace_seq_printf(s, "} /* %ps", func);
+ }
+
+ if (print_retaddr)
+ print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+ trace_flags, false);
+
+ if (print_retval) {
if (hex_format || (err_code == 0))
- trace_seq_printf(s, "} /* %ps = 0x%lx */\n",
- func, retval);
+ trace_seq_printf(s, " ret=0x%lx", retval);
else
- trace_seq_printf(s, "} /* %ps = %ld */\n",
- func, err_code);
+ trace_seq_printf(s, " ret=%ld", err_code);
}
+
+ if (!entry || print_retval || print_retaddr)
+ trace_seq_puts(s, " */");
}
#else
-#define __TRACE_GRAPH_PRINT_RETVAL 0
-
-#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \
+ do {} while (0)
#endif
@@ -687,12 +899,16 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
+ unsigned long ret_func;
+ int args_size;
int cpu = iter->cpu;
int i;
+ args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
+
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
- duration = graph_ret->rettime - graph_ret->calltime;
+ duration = ret_entry->rettime - ret_entry->calltime;
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -719,15 +935,25 @@ print_graph_entry_leaf(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
+ ret_func = graph_ret->func + iter->tr->text_delta;
+
/*
- * Write out the function return value if the option function-retval is
- * enabled.
+ * Write out the function return value or return address
*/
- if (flags & __TRACE_GRAPH_PRINT_RETVAL)
- print_graph_retval(s, graph_ret->retval, true, (void *)call->func,
- !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
- else
- trace_seq_printf(s, "%ps();\n", (void *)call->func);
+ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
+ print_graph_retval(s, entry, graph_ret,
+ (void *)graph_ret->func + iter->tr->text_delta,
+ flags, tr->trace_flags, args_size);
+ } else {
+ trace_seq_printf(s, "%ps", (void *)ret_func);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
+ print_function_args(s, entry->args, ret_func);
+ trace_seq_putc(s, ';');
+ } else
+ trace_seq_puts(s, "();");
+ }
+ trace_seq_putc(s, '\n');
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -743,6 +969,8 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
+ int args_size;
int i;
if (data) {
@@ -765,7 +993,24 @@ print_graph_entry_nested(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
- trace_seq_printf(s, "%ps() {\n", (void *)call->func);
+ func = call->func + iter->tr->text_delta;
+
+ trace_seq_printf(s, "%ps", (void *)func);
+
+ args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
+ print_function_args(s, entry->args, func);
+ else
+ trace_seq_puts(s, "()");
+
+ trace_seq_puts(s, " {");
+
+ if (flags & __TRACE_GRAPH_PRINT_RETADDR &&
+ entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
+ print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
+ tr->trace_flags, true);
+ trace_seq_putc(s, '\n');
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
@@ -840,6 +1085,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
int *depth_irq;
struct fgraph_data *data = iter->private;
+ addr += iter->tr->text_delta;
+
/*
* If we are either displaying irqs, or we got called as
* a graph event and private data does not exist,
@@ -928,21 +1175,38 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter, u32 flags)
{
struct fgraph_data *data = iter->private;
- struct ftrace_graph_ent *call = &field->graph_ent;
+ struct ftrace_graph_ent *call;
struct ftrace_graph_ret_entry *leaf_ret;
static enum print_line_t ret;
int cpu = iter->cpu;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * it can be safely saved at the stack.
+ */
+ struct ftrace_graph_ent_entry *entry;
+ u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
+
+ /* The ent_size is expected to be as big as the entry */
+ if (iter->ent_size > sizeof(save_buf))
+ iter->ent_size = sizeof(save_buf);
+
+ entry = (void *)save_buf;
+ memcpy(entry, field, iter->ent_size);
+
+ call = &entry->graph_ent;
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
- leaf_ret = get_return_for_leaf(iter, field);
+ leaf_ret = get_return_for_leaf(iter, entry);
if (leaf_ret)
- ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
+ ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags);
else
- ret = print_graph_entry_nested(iter, field, s, cpu, flags);
+ ret = print_graph_entry_nested(iter, entry, s, cpu, flags);
if (data) {
/*
@@ -960,18 +1224,24 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
}
static enum print_line_t
-print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s,
struct trace_entry *ent, struct trace_iterator *iter,
u32 flags)
{
- unsigned long long duration = trace->rettime - trace->calltime;
+ struct ftrace_graph_ret *trace = &retentry->ret;
+ u64 calltime = retentry->calltime;
+ u64 rettime = retentry->rettime;
+ unsigned long long duration = rettime - calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
+ unsigned long func;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
int i;
+ func = trace->func + iter->tr->text_delta;
+
if (check_irq_return(iter, flags, trace->depth))
return TRACE_TYPE_HANDLED;
@@ -1007,11 +1277,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
/*
* Always write out the function name and its return value if the
- * function-retval option is enabled.
+ * funcgraph-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, trace->retval, false, (void *)trace->func,
- !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX));
+ print_graph_retval(s, NULL, trace, (void *)func, flags,
+ tr->trace_flags, 0);
} else {
/*
* If the return function does not have a matching entry,
@@ -1021,10 +1291,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* that if the funcgraph-tail option is enabled.
*/
if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
- trace_seq_puts(s, "}\n");
+ trace_seq_puts(s, "}");
else
- trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+ trace_seq_printf(s, "} /* %ps */", (void *)func);
}
+ trace_seq_putc(s, '\n');
/* Overrun */
if (flags & TRACE_GRAPH_PRINT_OVERRUN)
@@ -1126,7 +1397,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
* to print out the missing entry which would never go out.
*/
if (data && data->failed) {
- field = &data->ent;
+ field = &data->ent.ent;
iter->cpu = data->cpu;
ret = print_graph_entry(field, s, iter, flags);
if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
@@ -1139,21 +1410,23 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- /*
- * print_graph_entry() may consume the current event,
- * thus @field may become invalid, so we need to save it.
- * sizeof(struct ftrace_graph_ent_entry) is very small,
- * it can be safely saved at the stack.
- */
- struct ftrace_graph_ent_entry saved;
trace_assign_type(field, entry);
- saved = *field;
- return print_graph_entry(&saved, s, iter, flags);
+ return print_graph_entry(field, s, iter, flags);
+ }
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ case TRACE_GRAPH_RETADDR_ENT: {
+ struct fgraph_retaddr_ent_entry saved;
+ struct fgraph_retaddr_ent_entry *rfield;
+
+ trace_assign_type(rfield, entry);
+ saved = *rfield;
+ return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags);
}
+#endif
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter, flags);
+ return print_graph_return(field, s, entry, iter, flags);
}
case TRACE_STACK:
case TRACE_FN:
@@ -1317,6 +1590,7 @@ void graph_trace_close(struct trace_iterator *iter)
if (data) {
free_percpu(data->cpu_data);
kfree(data);
+ iter->private = NULL;
}
}
@@ -1332,6 +1606,9 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (bit == TRACE_GRAPH_GRAPH_TIME)
ftrace_graph_graph_time_control(set);
+ if (bit == TRACE_GRAPH_ARGS)
+ return ftrace_graph_trace_args(tr, set);
+
return 0;
}
@@ -1344,6 +1621,13 @@ static struct trace_event graph_trace_entry_event = {
.funcs = &graph_functions,
};
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+static struct trace_event graph_trace_retaddr_entry_event = {
+ .type = TRACE_GRAPH_RETADDR_ENT,
+ .funcs = &graph_functions,
+};
+#endif
+
static struct trace_event graph_trace_ret_event = {
.type = TRACE_GRAPH_RET,
.funcs = &graph_functions
@@ -1362,6 +1646,7 @@ static struct tracer graph_trace __tracer_data = {
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
+ .allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
@@ -1429,6 +1714,13 @@ static __init int init_graph_trace(void)
return 1;
}
+#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
+ if (!register_trace_event(&graph_trace_retaddr_entry_event)) {
+ pr_warn("Warning: could not register graph trace retaddr events\n");
+ return 1;
+ }
+#endif
+
if (!register_trace_event(&graph_trace_ret_event)) {
pr_warn("Warning: could not register graph trace events\n");
return 1;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b791524a6536..b65353ec2837 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -130,7 +130,6 @@ static bool hwlat_busy;
static void trace_hwlat_sample(struct hwlat_sample *sample)
{
struct trace_array *tr = hwlat_trace;
- struct trace_event_call *call = &event_hwlat;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct hwlat_entry *entry;
@@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
entry->nmi_count = sample->nmi_count;
entry->count = sample->count;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/* Macros to encapsulate the time capturing infrastructure */
@@ -520,6 +518,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy)
if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU)
goto out_unlock;
+ if (!cpu_online(cpu))
+ goto out_unlock;
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
goto out_unlock;
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ba37f768e2f2..5496758b6c76 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr,
return 0;
*data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&(*data)->disabled);
+ disabled = local_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
return 1;
- atomic_dec(&(*data)->disabled);
+ local_dec(&(*data)->disabled);
return 0;
}
@@ -150,9 +150,9 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_flags(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
#endif /* CONFIG_FUNCTION_TRACER */
@@ -175,15 +175,18 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
return start_irqsoff_tracer(irqsoff_trace, set);
}
-static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
int ret;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -198,28 +201,44 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_dec(tr, &data, &flags))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
trace_ctx = tracing_gen_ctx_flags(flags);
ret = __trace_graph_entry(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
return ret;
}
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
+ rettime = trace_clock_local();
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+
trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
+ local_dec(&data->disabled);
}
static struct fgraph_ops fgraph_ops = {
@@ -231,8 +250,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
@@ -276,11 +293,17 @@ __trace_function(struct trace_array *tr,
if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
#else
-#define __trace_function trace_function
+static inline void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned int trace_ctx)
+{
+ return trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+}
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
@@ -374,6 +397,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
+ long disabled;
if (!tracer_enabled || !tracing_is_enabled())
return;
@@ -385,20 +409,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (unlikely(!data) || atomic_read(&data->disabled))
+ if (unlikely(!data) || local_read(&data->disabled))
return;
- atomic_inc(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
- data->critical_sequence = max_sequence;
- data->preempt_timestamp = ftrace_now(cpu);
- data->critical_start = parent_ip ? : ip;
+ if (disabled == 1) {
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ data->critical_start = parent_ip ? : ip;
- __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
+ __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
- per_cpu(tracing_cpu, cpu) = 1;
+ per_cpu(tracing_cpu, cpu) = 1;
+ }
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
static nokprobe_inline void
@@ -408,6 +434,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ long disabled;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
@@ -422,16 +449,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (unlikely(!data) ||
- !data->critical_start || atomic_read(&data->disabled))
+ !data->critical_start || local_read(&data->disabled))
return;
- atomic_inc(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
- trace_ctx = tracing_gen_ctx();
- __trace_function(tr, ip, parent_ip, trace_ctx);
- check_critical_timing(tr, data, parent_ip ? : ip, cpu);
- data->critical_start = 0;
- atomic_dec(&data->disabled);
+ if (disabled == 1) {
+ trace_ctx = tracing_gen_ctx();
+ __trace_function(tr, ip, parent_ip, trace_ctx);
+ check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+ data->critical_start = 0;
+ }
+
+ local_dec(&data->disabled);
}
/* start and stop critical timings used to for stoppage (in idle) */
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..d7b135de958a 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -96,22 +96,18 @@ static int kdb_ftdump(int argc, const char **argv)
{
int skip_entries = 0;
long cpu_file;
- char *cp;
+ int err;
int cnt;
- int cpu;
if (argc > 2)
return KDB_ARGCOUNT;
- if (argc) {
- skip_entries = simple_strtol(argv[1], &cp, 0);
- if (*cp)
- skip_entries = 0;
- }
+ if (argc && kstrtoint(argv[1], 0, &skip_entries))
+ return KDB_BADINT;
if (argc == 2) {
- cpu_file = simple_strtol(argv[2], &cp, 0);
- if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ err = kstrtol(argv[2], 0, &cpu_file);
+ if (err || cpu_file >= NR_CPUS || cpu_file < 0 ||
!cpu_online(cpu_file))
return KDB_BADINT;
} else {
@@ -123,9 +119,7 @@ static int kdb_ftdump(int argc, const char **argv)
trace_init_global_iter(&iter);
iter.buffer_iter = buffer_iter;
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_disable(iter.tr);
/* A negative skip_entries means skip all but the last entries */
if (skip_entries < 0) {
@@ -138,9 +132,7 @@ static int kdb_ftdump(int argc, const char **argv)
ftrace_dump_buf(skip_entries, cpu_file);
- for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_enable(iter.tr);
kdb_trap_printk--;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 14099cc17fc9..3e5c47b6d7b2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/bpf-cgroup.h>
+#include <linux/cleanup.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -111,6 +112,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}
+#ifdef CONFIG_MODULES
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
char *p;
@@ -122,13 +124,18 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
if (!p)
return true;
*p = '\0';
- rcu_read_lock_sched();
- ret = !!find_module(tk->symbol);
- rcu_read_unlock_sched();
+ scoped_guard(rcu)
+ ret = !!find_module(tk->symbol);
*p = ':';
return ret;
}
+#else
+static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+{
+ return false;
+}
+#endif
static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
@@ -250,6 +257,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
}
}
+DEFINE_FREE(free_trace_kprobe, struct trace_kprobe *,
+ if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -261,7 +271,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
int maxactive,
int nargs, bool is_return)
{
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret = -ENOMEM;
tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
@@ -270,12 +280,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
tk->nhit = alloc_percpu(unsigned long);
if (!tk->nhit)
- goto error;
+ return ERR_PTR(ret);
if (symbol) {
tk->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tk->symbol)
- goto error;
+ return ERR_PTR(ret);
tk->rp.kp.symbol_name = tk->symbol;
tk->rp.kp.offset = offs;
} else
@@ -292,13 +302,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
ret = trace_probe_init(&tk->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tk->devent, &trace_kprobe_ops);
- return tk;
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return_ptr(tk);
}
static struct trace_kprobe *find_trace_kprobe(const char *event,
@@ -627,7 +634,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
struct trace_kprobe *old_tk;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
trace_probe_group_name(&tk->tp));
@@ -635,11 +642,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_kprobe(tk, old_tk);
+ return -EEXIST;
}
- goto end;
+ return append_trace_kprobe(tk, old_tk);
}
/* Register new event */
@@ -650,7 +655,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register k*probe */
@@ -665,8 +670,22 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
else
dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
-end:
- mutex_unlock(&event_mutex);
+ return ret;
+}
+
+#ifdef CONFIG_MODULES
+static int validate_module_probe_symbol(const char *modname, const char *symbol);
+
+static int register_module_trace_kprobe(struct module *mod, struct trace_kprobe *tk)
+{
+ const char *p;
+ int ret = 0;
+
+ p = strchr(trace_kprobe_symbol(tk), ':');
+ if (p)
+ ret = validate_module_probe_symbol(module_name(mod), p + 1);
+ if (!ret)
+ ret = __register_trace_kprobe(tk);
return ret;
}
@@ -683,27 +702,36 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/* Update probes on coming module */
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
__unregister_trace_kprobe(tk);
- ret = __register_trace_kprobe(tk);
+ ret = register_module_trace_kprobe(mod, tk);
if (ret)
pr_warn("Failed to re-register probe %s on %s: %d\n",
trace_probe_name(&tk->tp),
module_name(mod), ret);
}
}
- mutex_unlock(&event_mutex);
return NOTIFY_DONE;
}
static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
- .priority = 1 /* Invoked after kprobe module callback */
+ .priority = 2 /* Invoked after kprobe and jump_label module callback */
};
+static int trace_kprobe_register_module_notifier(void)
+{
+ return register_module_notifier(&trace_kprobe_module_nb);
+}
+#else
+static int trace_kprobe_register_module_notifier(void)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
static int count_symbols(void *data, unsigned long unused)
{
@@ -729,21 +757,84 @@ static int count_mod_symbols(void *data, const char *name, unsigned long unused)
return 0;
}
-static unsigned int number_of_same_symbols(char *func_name)
+static unsigned int number_of_same_symbols(const char *mod, const char *func_name)
{
struct sym_count_ctx ctx = { .count = 0, .name = func_name };
- kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+ if (!mod)
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
- module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+ module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx);
return ctx.count;
}
+static int validate_module_probe_symbol(const char *modname, const char *symbol)
+{
+ unsigned int count = number_of_same_symbols(modname, symbol);
+
+ if (count > 1) {
+ /*
+ * Users should use ADDR to remove the ambiguity of
+ * using KSYM only.
+ */
+ return -EADDRNOTAVAIL;
+ } else if (count == 0) {
+ /*
+ * We can return ENOENT earlier than when register the
+ * kprobe.
+ */
+ return -ENOENT;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_MODULES
+/* Return NULL if the module is not loaded or under unloading. */
+static struct module *try_module_get_by_name(const char *name)
+{
+ struct module *mod;
+
+ guard(rcu)();
+ mod = find_module(name);
+ if (mod && !try_module_get(mod))
+ mod = NULL;
+ return mod;
+}
+#else
+#define try_module_get_by_name(name) (NULL)
+#endif
+
+static int validate_probe_symbol(char *symbol)
+{
+ struct module *mod = NULL;
+ char *modname = NULL, *p;
+ int ret = 0;
+
+ p = strchr(symbol, ':');
+ if (p) {
+ modname = symbol;
+ symbol = p + 1;
+ *p = '\0';
+ mod = try_module_get_by_name(modname);
+ if (!mod)
+ goto out;
+ }
+
+ ret = validate_module_probe_symbol(modname, symbol);
+out:
+ if (p)
+ *p = ':';
+ if (mod)
+ module_put(mod);
+ return ret;
+}
+
static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
struct pt_regs *regs);
-static int __trace_kprobe_create(int argc, const char *argv[])
+static int trace_kprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -769,11 +860,12 @@ static int __trace_kprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_kprobe *tk = NULL;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int i, len, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL, *tmp = NULL;
- const char **new_argv = NULL;
+ char *symbol __free(kfree) = NULL;
+ char *tmp = NULL;
+ const char **new_argv __free(kfree) = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
enum probe_print_type ptype;
int maxactive = 0;
@@ -782,7 +874,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
- struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ char *dbuf __free(kfree) = NULL;
switch (argv[0][0]) {
case 'r':
@@ -796,8 +888,6 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (argc < 2)
return -ECANCELED;
- trace_probe_log_init("trace_kprobe", argc, argv);
-
event = strchr(&argv[0][1], ':');
if (event)
event++;
@@ -805,7 +895,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (isdigit(argv[0][1])) {
if (!is_return) {
trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
+ return -EINVAL;
}
if (event)
len = event - &argv[0][1] - 1;
@@ -813,21 +903,21 @@ static int __trace_kprobe_create(int argc, const char *argv[])
len = strlen(&argv[0][1]);
if (len > MAX_EVENT_NAME_LEN - 1) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
memcpy(buf, &argv[0][1], len);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
/* kretprobes instances are iterated over via a list. The
* maximum should stay reasonable.
*/
if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -836,10 +926,9 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
trace_probe_log_set_index(1);
/* Check whether uprobe event specified */
- if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
- ret = -ECANCELED;
- goto error;
- }
+ if (strchr(argv[1], '/') && strchr(argv[1], ':'))
+ return -ECANCELED;
+
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
if (!symbol)
@@ -852,7 +941,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
is_return = true;
} else {
trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -860,42 +949,25 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
trace_probe_log_err(0, BAD_PROBE_ADDR);
- goto parse_error;
+ return -EINVAL;
+ }
+ ret = validate_probe_symbol(symbol);
+ if (ret) {
+ if (ret == -EADDRNOTAVAIL)
+ trace_probe_log_err(0, NON_UNIQ_SYMBOL);
+ else
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ return -EINVAL;
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
if (ret == 0 && !is_return)
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
/* Defer the ENOENT case until register kprobe */
if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
- goto parse_error;
- }
- }
-
- if (symbol && !strchr(symbol, ':')) {
- unsigned int count;
-
- count = number_of_same_symbols(symbol);
- if (count > 1) {
- /*
- * Users should use ADDR to remove the ambiguity of
- * using KSYM only.
- */
- trace_probe_log_err(0, NON_UNIQ_SYMBOL);
- ret = -EADDRNOTAVAIL;
-
- goto error;
- } else if (count == 0) {
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- trace_probe_log_err(0, BAD_PROBE_ADDR);
- ret = -ENOENT;
-
- goto error;
+ return -EINVAL;
}
}
@@ -904,7 +976,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return ret;
}
if (!event) {
@@ -920,18 +992,27 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
argc -= 2; argv += 2;
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
+ abuf, MAX_BTF_ARGS_LEN, ctx);
if (IS_ERR(new_argv)) {
ret = PTR_ERR(new_argv);
new_argv = NULL;
- goto out;
+ return ret;
}
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
+ if (argc > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ return -E2BIG;
+ }
+
+ ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
+ if (ret)
+ return ret;
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
@@ -940,16 +1021,16 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = PTR_ERR(tk);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tk is not allocated */
+ return ret; /* We know tk is not allocated */
}
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
}
/* entry handler for kretprobe */
if (is_return && tk->tp.entry_arg) {
@@ -960,7 +1041,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_kprobe(tk);
if (ret) {
@@ -971,26 +1052,34 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return ret;
}
+ /*
+ * Here, 'tk' has been registered to the list successfully,
+ * so we don't need to free it.
+ */
+ tk = NULL;
+
+ return 0;
+}
+
+static int trace_kprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ int ret;
+
+ trace_probe_log_init("trace_kprobe", argc, argv);
+
+ ret = trace_kprobe_create_internal(argc, argv, &ctx);
-out:
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_kprobe(tk);
- goto out;
}
static int trace_kprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_kprobe_create);
+ return trace_probe_create(raw_command, trace_kprobe_create_cb);
}
static int create_or_delete_trace_kprobe(const char *raw_command)
@@ -1000,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_kprobe_ops);
- ret = trace_kprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_kprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -1806,26 +1895,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
enum probe_print_type ptype;
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret;
char *event;
if (func) {
- unsigned int count;
-
- count = number_of_same_symbols(func);
- if (count > 1)
- /*
- * Users should use addr to remove the ambiguity of
- * using func only.
- */
- return ERR_PTR(-EADDRNOTAVAIL);
- else if (count == 0)
- /*
- * We can return ENOENT earlier than when register the
- * kprobe.
- */
- return ERR_PTR(-ENOENT);
+ ret = validate_probe_symbol(func);
+ if (ret)
+ return ERR_PTR(ret);
}
/*
@@ -1849,19 +1926,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
ptype = trace_kprobe_is_return(tk) ?
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
- if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
- ret = -ENOMEM;
- goto error;
- }
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0)
+ return ERR_PTR(-ENOMEM);
ret = __register_trace_kprobe(tk);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
- return trace_probe_event_call(&tk->tp);
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return trace_probe_event_call(&(no_free_ptr(tk)->tp));
}
void destroy_local_trace_kprobe(struct trace_event_call *event_call)
@@ -1890,13 +1962,12 @@ static __init void enable_boot_kprobe_events(void)
struct trace_kprobe *tk;
struct dyn_event *pos;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
list_for_each_entry(file, &tr->events, list)
if (file->event_call == trace_probe_event_call(&tk->tp))
trace_event_enable_disable(file, 1, 0);
}
- mutex_unlock(&event_mutex);
}
static __init void setup_boot_kprobe_events(void)
@@ -1933,7 +2004,7 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;
- if (register_module_notifier(&trace_kprobe_module_nb))
+ if (trace_kprobe_register_module_notifier())
return -EINVAL;
return 0;
@@ -1999,19 +2070,16 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function entry.\n");
+ if (WARN_ONCE(ret, "error on probing function entry.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on probing function entry.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2020,19 +2088,16 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on probing function return.\n");
+ if (WARN_ONCE(ret, "error on probing function return.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd new probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2055,18 +2120,15 @@ static __init int kprobe_trace_self_tests_init(void)
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2074,18 +2136,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
- if (WARN_ON_ONCE(tk == NULL)) {
- pr_warn("error on getting 2nd test probe.\n");
+ if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) {
warn++;
} else {
- if (trace_kprobe_nhit(tk) != 1) {
- pr_warn("incorrect number of testprobe2 hits\n");
+ if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+ "incorrect number of testprobe2 hits."))
warn++;
- }
file = find_trace_probe_file(tk, top_trace_array());
- if (WARN_ON_ONCE(file == NULL)) {
- pr_warn("error on getting probe file.\n");
+ if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2093,23 +2152,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
ret = create_or_delete_trace_kprobe("-:testprobe");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
ret = create_or_delete_trace_kprobe("-:testprobe2");
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on deleting a probe.\n");
+ if (WARN_ONCE(ret, "error on deleting a probe."))
warn++;
- }
+
end:
- ret = dyn_events_release_all(&trace_kprobe_ops);
- if (WARN_ON_ONCE(ret)) {
- pr_warn("error on cleaning up probes.\n");
- warn++;
- }
/*
* Wait for the optimizer work to finish. Otherwise it might fiddle
* with probes in already freed __init text.
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 64e77b513697..c706544be60c 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -291,10 +291,8 @@ __init static int init_mmio_trace(void)
device_initcall(init_mmio_trace);
static void __trace_mmiotrace_rw(struct trace_array *tr,
- struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
- struct trace_event_call *call = &event_mmiotrace_rw;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
@@ -310,22 +308,18 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->rw = *rw;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
- __trace_mmiotrace_rw(tr, data, rw);
+ __trace_mmiotrace_rw(tr, rw);
}
static void __trace_mmiotrace_map(struct trace_array *tr,
- struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
- struct trace_event_call *call = &event_mmiotrace_map;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
@@ -341,19 +335,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->map = *map;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data;
-
- preempt_disable();
- data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
- __trace_mmiotrace_map(tr, data, map);
- preempt_enable();
+ __trace_mmiotrace_map(tr, map);
}
int mmio_trace_printk(const char *fmt, va_list args)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a8e28f9b9271..6819b93309ce 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void)
return this_cpu_ptr(&per_cpu_osnoise_var);
}
+/*
+ * Protect the interface.
+ */
+static struct mutex interface_lock;
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Runtime information for the timer mode.
@@ -259,14 +264,20 @@ static inline void tlat_var_reset(void)
{
struct timerlat_variables *tlat_var;
int cpu;
+
+ /* Synchronize with the timerlat interfaces */
+ mutex_lock(&interface_lock);
/*
* So far, all the values are initialized as 0, so
* zeroing the structure is perfect.
*/
for_each_cpu(cpu, cpu_online_mask) {
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
}
+ mutex_unlock(&interface_lock);
}
#else /* CONFIG_TIMERLAT_TRACER */
#define tlat_var_reset() do {} while (0)
@@ -305,38 +316,6 @@ static inline void osn_var_reset_all(void)
bool trace_osnoise_callback_enabled;
/*
- * osnoise sample structure definition. Used to store the statistics of a
- * sample run.
- */
-struct osnoise_sample {
- u64 runtime; /* runtime */
- u64 noise; /* noise */
- u64 max_sample; /* max single noise sample */
- int hw_count; /* # HW (incl. hypervisor) interference */
- int nmi_count; /* # NMIs during this sample */
- int irq_count; /* # IRQs during this sample */
- int softirq_count; /* # softirqs during this sample */
- int thread_count; /* # threads during this sample */
-};
-
-#ifdef CONFIG_TIMERLAT_TRACER
-/*
- * timerlat sample structure definition. Used to store the statistics of
- * a sample run.
- */
-struct timerlat_sample {
- u64 timer_latency; /* timer_latency */
- unsigned int seqnum; /* unique sequence */
- int context; /* timer context */
-};
-#endif
-
-/*
- * Protect the interface.
- */
-static struct mutex interface_lock;
-
-/*
* Tracer data.
*/
static struct osnoise_data {
@@ -491,9 +470,8 @@ static void print_osnoise_headers(struct seq_file *s)
* Record an osnoise_sample into the tracer buffer.
*/
static void
-__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
+__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -511,22 +489,23 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe
entry->softirq_count = sample->softirq_count;
entry->thread_count = sample->thread_count;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
- * Record an osnoise_sample on all osnoise instances.
+ * Record an osnoise_sample on all osnoise instances and fire trace event.
*/
-static void trace_osnoise_sample(struct osnoise_sample *sample)
+static void record_osnoise_sample(struct osnoise_sample *sample)
{
struct osnoise_instance *inst;
struct trace_buffer *buffer;
+ trace_osnoise_sample(sample);
+
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
buffer = inst->tr->array_buffer.buffer;
- __trace_osnoise_sample(sample, buffer);
+ __record_osnoise_sample(sample, buffer);
}
rcu_read_unlock();
}
@@ -570,9 +549,8 @@ static void print_timerlat_headers(struct seq_file *s)
#endif /* CONFIG_PREEMPT_RT */
static void
-__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
+__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -585,22 +563,23 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf
entry->context = sample->context;
entry->timer_latency = sample->timer_latency;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
* Record an timerlat_sample into the tracer buffer.
*/
-static void trace_timerlat_sample(struct timerlat_sample *sample)
+static void record_timerlat_sample(struct timerlat_sample *sample)
{
struct osnoise_instance *inst;
struct trace_buffer *buffer;
+ trace_timerlat_sample(sample);
+
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
buffer = inst->tr->array_buffer.buffer;
- __trace_timerlat_sample(sample, buffer);
+ __record_timerlat_sample(sample, buffer);
}
rcu_read_unlock();
}
@@ -648,7 +627,6 @@ static void timerlat_save_stack(int skip)
static void
__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
{
- struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct stack_entry *entry;
@@ -662,8 +640,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit_nostack(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
}
/*
@@ -1229,6 +1206,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int
}
}
+static bool monitor_enabled;
+
static int register_migration_monitor(void)
{
int ret = 0;
@@ -1237,16 +1216,25 @@ static int register_migration_monitor(void)
* Timerlat thread migration check is only required when running timerlat in user-space.
* Thus, enable callback only if timerlat is set with no workload.
*/
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (WARN_ON_ONCE(monitor_enabled))
+ return 0;
+
ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!ret)
+ monitor_enabled = true;
+ }
return ret;
}
static void unregister_migration_monitor(void)
{
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
- unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!monitor_enabled)
+ return;
+
+ unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ monitor_enabled = false;
}
#else
static int register_migration_monitor(void)
@@ -1444,9 +1432,9 @@ static int run_osnoise(void)
save_osn_sample_stats(osn_var, &s);
/*
- * if threshold is 0, use the default value of 5 us.
+ * if threshold is 0, use the default value of 1 us.
*/
- threshold = tracing_thresh ? : 5000;
+ threshold = tracing_thresh ? : 1000;
/*
* Apply PREEMPT and IRQ disabled options.
@@ -1531,27 +1519,25 @@ static int run_osnoise(void)
/*
* In some cases, notably when running on a nohz_full CPU with
- * a stopped tick PREEMPT_RCU has no way to account for QSs.
- * This will eventually cause unwarranted noise as PREEMPT_RCU
- * will force preemption as the means of ending the current
- * grace period. We avoid this problem by calling
- * rcu_momentary_dyntick_idle(), which performs a zero duration
- * EQS allowing PREEMPT_RCU to end the current grace period.
- * This call shouldn't be wrapped inside an RCU critical
- * section.
+ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to
+ * account for QSs. This will eventually cause unwarranted
+ * noise as RCU forces preemption as the means of ending the
+ * current grace period. We avoid this by calling
+ * rcu_momentary_eqs(), which performs a zero duration EQS
+ * allowing RCU to end the current grace period. This call
+ * shouldn't be wrapped inside an RCU critical section.
*
- * Note that in non PREEMPT_RCU kernels QSs are handled through
- * cond_resched()
+ * Normally QSs for other cases are handled through cond_resched().
+ * For simplicity, however, we call rcu_momentary_eqs() for all
+ * configurations here.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- if (!disable_irq)
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
- if (!disable_irq)
- local_irq_enable();
- }
+ if (!disable_irq)
+ local_irq_enable();
/*
* For the non-preemptive kernel config: let threads runs, if
@@ -1597,7 +1583,7 @@ static int run_osnoise(void)
/* Save interference stats info */
diff_osn_sample_stats(osn_var, &s);
- trace_osnoise_sample(&s);
+ record_osnoise_sample(&s);
notify_new_max_latency(max_noise);
@@ -1612,6 +1598,7 @@ out:
static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
+static struct cpumask kthread_cpumask;
/*
* osnoise_sleep - sleep until the next period
@@ -1675,6 +1662,7 @@ static inline int osnoise_migration_pending(void)
*/
mutex_lock(&interface_lock);
this_cpu_osn_var()->kthread = NULL;
+ cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask);
mutex_unlock(&interface_lock);
return 1;
@@ -1790,7 +1778,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
s.timer_latency = diff;
s.context = IRQ_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
if (osnoise_data.stop_tracing) {
if (time_to_us(diff) >= osnoise_data.stop_tracing) {
@@ -1888,8 +1876,7 @@ static int timerlat_main(void *data)
tlat->count = 0;
tlat->tracing_thread = false;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
tlat->kthread = current;
osn_var->pid = current->pid;
/*
@@ -1910,7 +1897,7 @@ static int timerlat_main(void *data)
s.timer_latency = diff;
s.context = THREAD_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
notify_new_max_latency(diff);
@@ -1945,11 +1932,12 @@ static void stop_kthread(unsigned int cpu)
{
struct task_struct *kthread;
- kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread;
+ kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
if (kthread) {
- if (test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) &&
+ !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) {
kthread_stop(kthread);
- } else {
+ } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) {
/*
* This is a user thread waiting on the timerlat_fd. We need
* to close all users, and the best way to guarantee this is
@@ -1958,7 +1946,6 @@ static void stop_kthread(unsigned int cpu)
kill_pid(kthread->thread_pid, SIGKILL, 1);
put_task_struct(kthread);
}
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
} else {
/* if no workload, just return */
if (!test_bit(OSN_WORKLOAD, &osnoise_options)) {
@@ -1967,7 +1954,6 @@ static void stop_kthread(unsigned int cpu)
*/
per_cpu(per_cpu_osnoise_var, cpu).sampling = false;
barrier();
- return;
}
}
}
@@ -1999,6 +1985,10 @@ static int start_kthread(unsigned int cpu)
void *main = osnoise_main;
char comm[24];
+ /* Do not start a new thread if it is already running */
+ if (per_cpu(per_cpu_osnoise_var, cpu).kthread)
+ return 0;
+
if (timerlat_enabled()) {
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
@@ -2016,11 +2006,11 @@ static int start_kthread(unsigned int cpu)
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
- stop_per_cpu_kthreads();
return -ENOMEM;
}
per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread;
+ cpumask_set_cpu(cpu, &kthread_cpumask);
return 0;
}
@@ -2048,8 +2038,15 @@ static int start_per_cpu_kthreads(void)
*/
cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
- for_each_possible_cpu(cpu)
- per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) {
+ struct task_struct *kthread;
+
+ kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL);
+ if (!WARN_ON(!kthread))
+ kthread_stop(kthread);
+ }
+ }
for_each_cpu(cpu, current_mask) {
retval = start_kthread(cpu);
@@ -2070,24 +2067,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
unsigned int cpu = smp_processor_id();
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (!osnoise_has_registered_instances())
- goto out_unlock_trace;
+ return;
- mutex_lock(&interface_lock);
- cpus_read_lock();
+ guard(mutex)(&interface_lock);
+ guard(cpus_read_lock)();
+
+ if (!cpu_online(cpu))
+ return;
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
- goto out_unlock;
+ return;
start_kthread(cpu);
-
-out_unlock:
- cpus_read_unlock();
- mutex_unlock(&interface_lock);
-out_unlock_trace:
- mutex_unlock(&trace_types_lock);
}
static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
@@ -2285,31 +2279,22 @@ static ssize_t
osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
loff_t *ppos)
{
- char *mask_str;
+ char *mask_str __free(kfree) = NULL;
int len;
- mutex_lock(&interface_lock);
+ guard(mutex)(&interface_lock);
len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
mask_str = kmalloc(len, GFP_KERNEL);
- if (!mask_str) {
- count = -ENOMEM;
- goto out_unlock;
- }
+ if (!mask_str)
+ return -ENOMEM;
len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
- if (len >= count) {
- count = -EINVAL;
- goto out_free;
- }
+ if (len >= count)
+ return -EINVAL;
count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
-out_free:
- kfree(mask_str);
-out_unlock:
- mutex_unlock(&interface_lock);
-
return count;
}
@@ -2317,7 +2302,7 @@ out_unlock:
* osnoise_cpus_write - Write function for "cpus" entry
* @filp: The active open file structure
* @ubuf: The user buffer that contains the value to write
- * @cnt: The maximum number of bytes to write to "file"
+ * @count: The maximum number of bytes to write to "file"
* @ppos: The current position in @file
*
* This function provides a write implementation for the "cpus"
@@ -2335,10 +2320,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
{
cpumask_var_t osnoise_cpumask_new;
int running, err;
- char buf[256];
+ char *buf __free(kfree) = NULL;
- if (count >= 256)
- return -EINVAL;
+ buf = kmalloc(count, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
if (copy_from_user(buf, ubuf, count))
return -EFAULT;
@@ -2444,8 +2430,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
tlat = this_cpu_tmr_var();
tlat->count = 0;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
migrate_enable();
return 0;
@@ -2517,7 +2502,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
s.timer_latency = diff;
s.context = THREAD_URET;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
notify_new_max_latency(diff);
@@ -2552,7 +2537,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
s.timer_latency = diff;
s.context = THREAD_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
if (osnoise_data.stop_tracing_total) {
if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
@@ -2579,7 +2564,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file)
osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu);
tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu);
- hrtimer_cancel(&tlat_var->timer);
+ if (tlat_var->kthread)
+ hrtimer_cancel(&tlat_var->timer);
memset(tlat_var, 0, sizeof(*tlat_var));
osn_var->sampling = 0;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..0b3db02030a7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -5,6 +5,7 @@
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
*
*/
+#include "trace.h"
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
@@ -12,15 +13,19 @@
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/idr.h>
+#include <linux/btf.h>
+#include <linux/bpf.h>
+#include <linux/hashtable.h>
#include "trace_output.h"
+#include "trace_btf.h"
-/* must be a power of 2 */
-#define EVENT_HASHSIZE 128
+/* 2^7 = 128 */
+#define EVENT_HASH_BITS 7
DECLARE_RWSEM(trace_event_sem);
-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS);
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
@@ -317,10 +322,14 @@ EXPORT_SYMBOL(trace_raw_output_prep);
void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
{
+ struct trace_seq *s = &iter->seq;
va_list ap;
+ if (ignore_event(iter))
+ return;
+
va_start(ap, fmt);
- trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
+ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
va_end(ap);
}
EXPORT_SYMBOL(trace_event_printf);
@@ -460,20 +469,31 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
(entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' :
(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
bh_off ? 'b' :
- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
'.';
- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
+ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY |
TRACE_FLAG_PREEMPT_RESCHED)) {
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'B';
+ break;
case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'N';
break;
+ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED:
+ need_resched = 'L';
+ break;
+ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY:
+ need_resched = 'b';
+ break;
case TRACE_FLAG_NEED_RESCHED:
need_resched = 'n';
break;
case TRACE_FLAG_PREEMPT_RESCHED:
need_resched = 'p';
break;
+ case TRACE_FLAG_NEED_RESCHED_LAZY:
+ need_resched = 'l';
+ break;
default:
need_resched = '.';
break;
@@ -669,6 +689,88 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func)
+{
+ const struct btf_param *param;
+ const struct btf_type *t;
+ const char *param_name;
+ char name[KSYM_NAME_LEN];
+ unsigned long arg;
+ struct btf *btf;
+ s32 tid, nr = 0;
+ int a, p, x;
+
+ trace_seq_printf(s, "(");
+
+ if (!args)
+ goto out;
+ if (lookup_symbol_name(func, name))
+ goto out;
+
+ /* TODO: Pass module name here too */
+ t = btf_find_func_proto(name, &btf);
+ if (IS_ERR_OR_NULL(t))
+ goto out;
+
+ param = btf_get_func_param(t, &nr);
+ if (!param)
+ goto out_put;
+
+ for (a = 0, p = 0; p < nr; a++, p++) {
+ if (p)
+ trace_seq_puts(s, ", ");
+
+ /* This only prints what the arch allows (6 args by default) */
+ if (a == FTRACE_REGS_MAX_ARGS) {
+ trace_seq_puts(s, "...");
+ break;
+ }
+
+ arg = args[a];
+
+ param_name = btf_name_by_offset(btf, param[p].name_off);
+ if (param_name)
+ trace_seq_printf(s, "%s=", param_name);
+ t = btf_type_skip_modifiers(btf, param[p].type, &tid);
+
+ switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) {
+ case BTF_KIND_UNKN:
+ trace_seq_putc(s, '?');
+ /* Still print unknown type values */
+ fallthrough;
+ case BTF_KIND_PTR:
+ trace_seq_printf(s, "0x%lx", arg);
+ break;
+ case BTF_KIND_INT:
+ trace_seq_printf(s, "%ld", arg);
+ break;
+ case BTF_KIND_ENUM:
+ trace_seq_printf(s, "%ld", arg);
+ break;
+ default:
+ /* This does not handle complex arguments */
+ trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg);
+ for (x = sizeof(long); x < t->size; x += sizeof(long)) {
+ trace_seq_putc(s, ':');
+ if (++a == FTRACE_REGS_MAX_ARGS) {
+ trace_seq_puts(s, "...]");
+ goto out_put;
+ }
+ trace_seq_printf(s, "0x%lx", args[a]);
+ }
+ trace_seq_putc(s, ']');
+ break;
+ }
+ }
+out_put:
+ btf_put(btf);
+out:
+ trace_seq_printf(s, ")");
+}
+#endif
+
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -679,11 +781,8 @@ int trace_print_lat_context(struct trace_iterator *iter)
struct trace_event *ftrace_find_event(int type)
{
struct trace_event *event;
- unsigned key;
-
- key = type & (EVENT_HASHSIZE - 1);
- hlist_for_each_entry(event, &event_hash[key], node) {
+ hash_for_each_possible(event_hash, event, node, type) {
if (event->type == type)
return event;
}
@@ -738,7 +837,6 @@ void trace_event_read_unlock(void)
*/
int register_trace_event(struct trace_event *event)
{
- unsigned key;
int ret = 0;
down_write(&trace_event_sem);
@@ -771,9 +869,7 @@ int register_trace_event(struct trace_event *event)
if (event->funcs->binary == NULL)
event->funcs->binary = trace_nop_print;
- key = event->type & (EVENT_HASHSIZE - 1);
-
- hlist_add_head(&event->node, &event_hash[key]);
+ hash_add(event_hash, &event->node, event->type);
ret = event->type;
out:
@@ -788,7 +884,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
*/
int __unregister_trace_event(struct trace_event *event)
{
- hlist_del(&event->node);
+ hash_del(&event->node);
free_trace_event_type(event->type);
return 0;
}
@@ -842,6 +938,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
struct list_head *head)
{
struct ftrace_event_field *field;
+ struct trace_array *tr = iter->tr;
+ unsigned long long laddr;
+ unsigned long addr;
int offset;
int len;
int ret;
@@ -878,8 +977,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
case FILTER_PTR_STRING:
if (!iter->fmt_size)
trace_iter_expand_format(iter);
- pos = *(void **)pos;
- ret = strncpy_from_kernel_nofault(iter->fmt, pos,
+ addr = trace_adjust_address(tr, *(unsigned long *)pos);
+ ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr,
iter->fmt_size);
if (ret < 0)
trace_seq_printf(&iter->seq, "(0x%px)", pos);
@@ -888,8 +987,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
pos, iter->fmt);
break;
case FILTER_TRACE_FN:
- pos = *(void **)pos;
- trace_seq_printf(&iter->seq, "%pS", pos);
+ addr = trace_adjust_address(tr, *(unsigned long *)pos);
+ trace_seq_printf(&iter->seq, "%pS", (void *)addr);
break;
case FILTER_CPU:
case FILTER_OTHER:
@@ -919,14 +1018,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
break;
}
- trace_seq_printf(&iter->seq, "0x%x (%d)",
- *(unsigned int *)pos,
- *(unsigned int *)pos);
+ addr = *(unsigned int *)pos;
+
+ /* Some fields reference offset from _stext. */
+ if (!strcmp(field->name, "caller_offs") ||
+ !strcmp(field->name, "parent_offs")) {
+ unsigned long ip;
+
+ ip = addr + (unsigned long)_stext;
+ ip = trace_adjust_address(tr, ip);
+ trace_seq_printf(&iter->seq, "%pS ", (void *)ip);
+ }
+
+ if (sizeof(long) == 4) {
+ addr = trace_adjust_address(tr, addr);
+ trace_seq_printf(&iter->seq, "%pS (%d)",
+ (void *)addr, (int)addr);
+ } else {
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ (unsigned int)addr, (int)addr);
+ }
break;
case 8:
- trace_seq_printf(&iter->seq, "0x%llx (%lld)",
- *(unsigned long long *)pos,
- *(unsigned long long *)pos);
+ laddr = *(unsigned long long *)pos;
+ if (sizeof(long) == 8) {
+ laddr = trace_adjust_address(tr, (unsigned long)laddr);
+ trace_seq_printf(&iter->seq, "%pS (%lld)",
+ (void *)(long)laddr, laddr);
+ } else {
+ trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr);
+ }
break;
default:
trace_seq_puts(&iter->seq, "<INVALID-SIZE>");
@@ -946,11 +1067,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
struct trace_event_call *call;
struct list_head *head;
+ lockdep_assert_held_read(&trace_event_sem);
+
/* ftrace defined events have separate call structures */
if (event->type <= __TRACE_LAST_TYPE) {
bool found = false;
- down_read(&trace_event_sem);
list_for_each_entry(call, &ftrace_events, list) {
if (call->event.type == event->type) {
found = true;
@@ -960,7 +1082,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
if (call->event.type > __TRACE_LAST_TYPE)
break;
}
- up_read(&trace_event_sem);
if (!found) {
trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
goto out;
@@ -990,9 +1111,15 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, int flags)
+ unsigned long parent_ip, unsigned long *args,
+ struct trace_array *tr, int flags)
{
+ ip = trace_adjust_address(tr, ip);
+ parent_ip = trace_adjust_address(tr, parent_ip);
+
seq_print_ip_sym(s, ip, flags);
+ if (args)
+ print_function_args(s, args, ip);
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
trace_seq_puts(s, " <-");
@@ -1006,10 +1133,18 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long *args;
+ int args_size;
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ args_size = iter->ent_size - offsetof(struct ftrace_entry, args);
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
+ args = field->args;
+ else
+ args = NULL;
+
+ print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1242,7 +1377,11 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
trace_seq_puts(s, " => ");
- seq_print_ip_sym(s, *p, flags);
+ if ((*p) == FTRACE_TRAMPOLINE_MARKER) {
+ trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n");
+ continue;
+ }
+ seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags);
trace_seq_putc(s, '\n');
}
@@ -1587,10 +1726,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long ip;
trace_assign_type(field, iter->ent);
- seq_print_ip_sym(s, field->ip, flags);
+ ip = trace_adjust_address(iter->tr, field->ip);
+
+ seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
@@ -1674,7 +1816,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index dca40f1f1da4..2e305364f2a9 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -41,5 +41,14 @@ extern struct rw_semaphore trace_event_sem;
#define SEQ_PUT_HEX_FIELD(s, x) \
trace_seq_putmem_hex(s, &(x), sizeof(x))
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func);
+#else
+static inline void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func) {
+ trace_seq_puts(s, "()");
+}
+#endif
#endif
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index e37446f7916e..0c42b15c3800 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
+#include <linux/hardirq.h>
#include "trace.h"
#define CREATE_TRACE_POINTS
@@ -20,13 +21,29 @@
* tooling: these calls will only happen with RCU enabled, which can
* use a regular tracepoint.
*
- * On older architectures, use the rcuidle tracing methods (which
- * aren't NMI-safe - so exclude NMI contexts):
+ * On older architectures, RCU may not be watching in idle. In that
+ * case, wake up RCU to watch while calling the tracepoint. These
+ * aren't NMI-safe - so exclude NMI contexts:
*/
#ifdef CONFIG_ARCH_WANTS_NO_INSTR
-#define trace(point) trace_##point
+#define trace(point, args) trace_##point(args)
#else
-#define trace(point) if (!in_nmi()) trace_##point##_rcuidle
+#define trace(point, args) \
+ do { \
+ if (trace_##point##_enabled()) { \
+ bool exit_rcu = false; \
+ if (in_nmi()) \
+ break; \
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && \
+ is_idle_task(current)) { \
+ ct_irq_enter(); \
+ exit_rcu = true; \
+ } \
+ trace_##point(args); \
+ if (exit_rcu) \
+ ct_irq_exit(); \
+ } \
+ } while (0)
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -42,7 +59,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);
void trace_hardirqs_on_prepare(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -53,7 +70,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
- trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
this_cpu_write(tracing_irq_cpu, 0);
}
@@ -75,7 +92,7 @@ void trace_hardirqs_off_finish(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
}
}
@@ -89,7 +106,7 @@ void trace_hardirqs_off(void)
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
- trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);
+ trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1));
}
}
EXPORT_SYMBOL(trace_hardirqs_off);
@@ -100,13 +117,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off);
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- trace(preempt_enable)(a0, a1);
+ trace(preempt_enable, TP_ARGS(a0, a1));
tracer_preempt_on(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- trace(preempt_disable)(a0, a1);
+ trace(preempt_disable, TP_ARGS(a0, a1));
tracer_preempt_off(a0, a1);
}
#endif
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dfe3ee6035ec..424751cdf31f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "trace_probe: " fmt
#include <linux/bpf.h>
+#include <linux/fs.h>
#include "trace_btf.h"
#include "trace_probe.h"
@@ -153,9 +154,12 @@ fail:
}
static struct trace_probe_log trace_probe_log;
+extern struct mutex dyn_event_ops_mutex;
void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.subsystem = subsystem;
trace_probe_log.argc = argc;
trace_probe_log.argv = argv;
@@ -164,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
void trace_probe_log_clear(void)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
memset(&trace_probe_log, 0, sizeof(trace_probe_log));
}
void trace_probe_log_set_index(int index)
{
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
trace_probe_log.index = index;
}
@@ -177,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type)
char *command, *p;
int i, len = 0, pos = 0;
+ lockdep_assert_held(&dyn_event_ops_mutex);
+
if (!trace_probe_log.argv)
return;
@@ -275,7 +285,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
}
trace_probe_log_err(offset, NO_EVENT_NAME);
return -EINVAL;
- } else if (len > MAX_EVENT_NAME_LEN) {
+ } else if (len >= MAX_EVENT_NAME_LEN) {
trace_probe_log_err(offset, EVENT_TOO_LONG);
return -EINVAL;
}
@@ -553,6 +563,10 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
anon_offs = 0;
field = btf_find_struct_member(ctx->btf, type, fieldname,
&anon_offs);
+ if (IS_ERR(field)) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return PTR_ERR(field);
+ }
if (!field) {
trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
return -ENOENT;
@@ -765,6 +779,10 @@ static int check_prepare_btf_string_fetch(char *typename,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+/*
+ * Add the entry code to store the 'argnum'th parameter and return the offset
+ * in the entry data buffer where the data will be stored.
+ */
static int __store_entry_arg(struct trace_probe *tp, int argnum)
{
struct probe_entry_arg *earg = tp->entry_arg;
@@ -788,6 +806,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
tp->entry_arg = earg;
}
+ /*
+ * The entry code array is repeating the pair of
+ * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)]
+ * and the rest of entries are filled with [FETCH_OP_END].
+ *
+ * To reduce the redundant function parameter fetching, we scan the entry
+ * code array to find the FETCH_OP_ARG which already fetches the 'argnum'
+ * parameter. If it doesn't match, update 'offset' to find the last
+ * offset.
+ * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we
+ * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and
+ * return data offset so that caller can find the data offset in the entry
+ * data buffer.
+ */
offset = 0;
for (i = 0; i < earg->size - 1; i++) {
switch (earg->code[i].op) {
@@ -821,6 +853,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp)
if (!earg)
return 0;
+ /*
+ * earg->code[] array has an operation sequence which is run in
+ * the entry handler.
+ * The sequence stopped by FETCH_OP_END and each data stored in
+ * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA
+ * stores the data at the data buffer + its offset, and all data are
+ * "unsigned long" size. The offset must be increased when a data is
+ * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
+ * code array.
+ */
for (i = 0; i < earg->size; i++) {
switch (earg->code[i].op) {
case FETCH_OP_END:
@@ -1180,8 +1222,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
return ret;
}
-#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
-
/* Bitfield type needs to be parsed into a fetch function */
static int __parse_bitfield_probe_arg(const char *bf,
const struct fetch_type *t,
@@ -1406,7 +1446,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code, *tmp = NULL;
- char *type, *arg;
+ char *type, *arg __free(kfree) = NULL;
int ret, len;
len = strlen(argv);
@@ -1423,22 +1463,16 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
return -ENOMEM;
parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!parg->comm)
+ return -ENOMEM;
type = parse_probe_arg_type(arg, parg, ctx);
- if (IS_ERR(type)) {
- ret = PTR_ERR(type);
- goto out;
- }
+ if (IS_ERR(type))
+ return PTR_ERR(type);
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!code)
+ return -ENOMEM;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
ctx->last_type = NULL;
@@ -1466,7 +1500,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
parg->fmt = kmalloc(len, GFP_KERNEL);
if (!parg->fmt) {
ret = -ENOMEM;
- goto out;
+ goto fail;
}
snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype,
parg->count);
@@ -1494,8 +1528,6 @@ fail:
kfree(code->data);
}
kfree(tmp);
-out:
- kfree(arg);
return ret;
}
@@ -1665,7 +1697,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
{
const struct btf_param *params = NULL;
int i, j, n, used, ret, args_idx = -1;
- const char **new_argv = NULL;
+ const char **new_argv __free(kfree) = NULL;
ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
if (ret < 0)
@@ -1704,7 +1736,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
ret = sprint_nth_btf_arg(n, "", buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
@@ -1718,25 +1750,78 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
n = simple_strtoul(argv[i] + 4, &type, 10);
if (type && !(*type == ':' || *type == '\0')) {
trace_probe_log_err(0, BAD_VAR);
- ret = -ENOENT;
- goto error;
+ return ERR_PTR(-ENOENT);
}
/* Note: $argN starts from $arg1 */
ret = sprint_nth_btf_arg(n - 1, type, buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
} else
new_argv[j++] = argv[i];
}
- return new_argv;
+ return_ptr(new_argv);
+}
-error:
- kfree(new_argv);
- return ERR_PTR(ret);
+/* @buf: *buf must be equal to NULL. Caller must to free *buf */
+int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
+{
+ int i, used, ret;
+ const int bufsize = MAX_DENTRY_ARGS_LEN;
+ char *tmpbuf __free(kfree) = NULL;
+
+ if (*buf)
+ return -EINVAL;
+
+ used = 0;
+ for (i = 0; i < argc; i++) {
+ char *tmp __free(kfree) = NULL;
+ char *equal;
+ size_t arg_len;
+
+ if (!glob_match("*:%p[dD]", argv[i]))
+ continue;
+
+ if (!tmpbuf) {
+ tmpbuf = kmalloc(bufsize, GFP_KERNEL);
+ if (!tmpbuf)
+ return -ENOMEM;
+ }
+
+ tmp = kstrdup(argv[i], GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ equal = strchr(tmp, '=');
+ if (equal)
+ *equal = '\0';
+ arg_len = strlen(argv[i]);
+ tmp[arg_len - 4] = '\0';
+ if (argv[i][arg_len - 1] == 'd')
+ ret = snprintf(tmpbuf + used, bufsize - used,
+ "%s%s+0x0(+0x%zx(%s)):string",
+ equal ? tmp : "", equal ? "=" : "",
+ offsetof(struct dentry, d_name.name),
+ equal ? equal + 1 : tmp);
+ else
+ ret = snprintf(tmpbuf + used, bufsize - used,
+ "%s%s+0x0(+0x%zx(+0x%zx(%s))):string",
+ equal ? tmp : "", equal ? "=" : "",
+ offsetof(struct dentry, d_name.name),
+ offsetof(struct file, f_path.dentry),
+ equal ? equal + 1 : tmp);
+
+ if (ret >= bufsize - used)
+ return -ENOMEM;
+ argv[i] = tmpbuf + used;
+ used += ret + 1;
+ }
+
+ *buf = no_free_ptr(tmpbuf);
+ return 0;
}
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index cef3a50628a3..854e5668f5ee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -34,8 +34,8 @@
#define MAX_ARRAY_LEN 64
#define MAX_ARG_NAME_LEN 32
#define MAX_BTF_ARGS_LEN 128
+#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
-#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
/* Reserved field names */
#define FIELD_STRING_IP "__probe_ip"
@@ -428,6 +428,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char **traceprobe_expand_meta_args(int argc, const char *argv[],
int *new_argc, char *buf, int bufsize,
struct traceprobe_parse_context *ctx);
+extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
@@ -479,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \
C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
C(NO_TRACEPOINT, "Tracepoint is not found"), \
+ C(BAD_TP_NAME, "Invalid character in tracepoint name"),\
C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \
C(NO_GROUP_NAME, "Group name is not specified"), \
C(GROUP_TOO_LONG, "Group name is too long"), \
@@ -542,7 +544,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(NO_BTF_FIELD, "This field is not found."), \
C(BAD_BTF_TID, "Failed to get BTF type info."),\
C(BAD_TYPE4STR, "This type does not fit for string."),\
- C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),
+ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
+ C(TOO_MANY_ARGS, "Too many arguments are specified"), \
+ C(TOO_MANY_EARGS, "Too many entry arguments specified"),
#undef C
#define C(a, b) TP_ERR_##a
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 2caf0d2afb32..f39b37fcdb3b 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -232,7 +232,7 @@ array:
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata)
+__get_data_size(struct trace_probe *tp, void *regs, void *edata)
{
struct probe_arg *arg;
int i, len, ret = 0;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8a407adb0e1c..cb49f7279dc8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -187,7 +187,7 @@ static inline char *get_saved_cmdlines(int idx)
static inline void set_cmdline(int idx, const char *cmdline)
{
- strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+ strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = pid_max;
+ tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0469a04a355f..bf1cb80742ae 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr,
goto out_enable;
*data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&(*data)->disabled);
+ disabled = local_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
return 1;
out:
- atomic_dec(&(*data)->disabled);
+ local_dec(&(*data)->disabled);
out_enable:
preempt_enable_notrace();
@@ -112,14 +112,17 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
return start_func_tracer(tr, set);
}
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
int ret = 0;
- if (ftrace_graph_ignore_func(trace))
+ if (ftrace_graph_ignore_func(gops, trace))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
@@ -134,26 +137,43 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
ret = __trace_graph_entry(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
preempt_enable_notrace();
return ret;
}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+static void wakeup_graph_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
- ftrace_graph_addr_finish(trace);
+ ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- __trace_graph_return(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ rettime = trace_clock_local();
+
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
+ local_dec(&data->disabled);
preempt_enable_notrace();
return;
@@ -168,8 +188,6 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
@@ -222,10 +240,10 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
return;
local_irq_save(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
local_irq_restore(flags);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
preempt_enable_notrace();
}
@@ -307,7 +325,7 @@ __trace_function(struct trace_array *tr,
if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -376,7 +394,6 @@ tracing_sched_switch_trace(struct trace_array *tr,
struct task_struct *next,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_context_switch;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -394,8 +411,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void
@@ -404,7 +420,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct task_struct *curr,
unsigned int trace_ctx)
{
- struct trace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -422,8 +437,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
+ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx);
}
static void notrace
@@ -457,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -494,7 +508,7 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -545,11 +559,11 @@ probe_wakeup(void *ignore, struct task_struct *p)
* - wakeup_dl handles tasks belonging to sched_dl class only.
*/
if (tracing_dl || (wakeup_dl && !dl_task(p)) ||
- (wakeup_rt && !dl_task(p) && !rt_task(p)) ||
+ (wakeup_rt && !rt_or_dl_task(p)) ||
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -596,7 +610,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e9c5058a8efd..d88c44f1dfa5 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
case TRACE_PRINT:
case TRACE_BRANCH:
case TRACE_GRAPH_ENT:
+ case TRACE_GRAPH_RETADDR_ENT:
case TRACE_GRAPH_RET:
return 1;
}
@@ -756,13 +757,278 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define CHAR_NUMBER 123
+#define SHORT_NUMBER 12345
+#define WORD_NUMBER 1234567890
+#define LONG_NUMBER 1234567890123456789LL
+#define ERRSTR_BUFLEN 128
+
+struct fgraph_fixture {
+ struct fgraph_ops gops;
+ int store_size;
+ const char *store_type_name;
+ char error_str_buf[ERRSTR_BUFLEN];
+ char *error_str;
+};
+
+static __init int store_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ int size = fixture->store_size;
+ void *p;
+
+ p = fgraph_reserve_data(gops->idx, size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to reserve %s\n", type);
+ return 0;
+ }
+
+ switch (size) {
+ case 1:
+ *(char *)p = CHAR_NUMBER;
+ break;
+ case 2:
+ *(short *)p = SHORT_NUMBER;
+ break;
+ case 4:
+ *(int *)p = WORD_NUMBER;
+ break;
+ case 8:
+ *(long long *)p = LONG_NUMBER;
+ break;
+ }
+
+ return 1;
+}
+
+static __init void store_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+ const char *type = fixture->store_type_name;
+ long long expect = 0;
+ long long found = -1;
+ int size;
+ char *p;
+
+ p = fgraph_retrieve_data(gops->idx, &size);
+ if (!p) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to retrieve %s\n", type);
+ return;
+ }
+ if (fixture->store_size > size) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Retrieved size %d is smaller than expected %d\n",
+ size, (int)fixture->store_size);
+ return;
+ }
+
+ switch (fixture->store_size) {
+ case 1:
+ expect = CHAR_NUMBER;
+ found = *(char *)p;
+ break;
+ case 2:
+ expect = SHORT_NUMBER;
+ found = *(short *)p;
+ break;
+ case 4:
+ expect = WORD_NUMBER;
+ found = *(int *)p;
+ break;
+ case 8:
+ expect = LONG_NUMBER;
+ found = *(long long *)p;
+ break;
+ }
+
+ if (found != expect) {
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "%s returned not %lld but %lld\n", type, expect, found);
+ return;
+ }
+ fixture->error_str = NULL;
+}
+
+static int __init init_fgraph_fixture(struct fgraph_fixture *fixture)
+{
+ char *func_name;
+ int len;
+
+ snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+ "Failed to execute storage %s\n", fixture->store_type_name);
+ fixture->error_str = fixture->error_str_buf;
+
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ return ftrace_set_filter(&fixture->gops.ops, func_name, len, 1);
+}
+
+/* Test fgraph storage for each size */
+static int __init test_graph_storage_single(struct fgraph_fixture *fixture)
+{
+ int size = fixture->store_size;
+ int ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing fgraph storage of %d byte%s: ", size, str_plural(size));
+
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ return -1;
+ }
+
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ return -1;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str) {
+ pr_cont("*** %s ***", fixture->error_str);
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct fgraph_fixture store_bytes[4] __initdata = {
+ [0] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 1,
+ .store_type_name = "byte",
+ },
+ [1] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 2,
+ .store_type_name = "short",
+ },
+ [2] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 4,
+ .store_type_name = "word",
+ },
+ [3] = {
+ .gops = {
+ .entryfunc = store_entry,
+ .retfunc = store_return,
+ },
+ .store_size = 8,
+ .store_type_name = "long long",
+ },
+};
+
+static __init int test_graph_storage_multi(void)
+{
+ struct fgraph_fixture *fixture;
+ bool printed = false;
+ int i, j, ret;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing multiple fgraph storage on a function: ");
+
+ for (i = 0; i < ARRAY_SIZE(store_bytes); i++) {
+ fixture = &store_bytes[i];
+ ret = init_fgraph_fixture(fixture);
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ printed = true;
+ goto out2;
+ }
+ }
+
+ for (j = 0; j < ARRAY_SIZE(store_bytes); j++) {
+ fixture = &store_bytes[j];
+ ret = register_ftrace_graph(&fixture->gops);
+ if (ret) {
+ pr_warn("Failed to init store_bytes fgraph tracing\n");
+ printed = true;
+ goto out1;
+ }
+ }
+
+ DYN_FTRACE_TEST_NAME();
+out1:
+ while (--j >= 0) {
+ fixture = &store_bytes[j];
+ unregister_ftrace_graph(&fixture->gops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+out2:
+ while (--i >= 0) {
+ fixture = &store_bytes[i];
+ ftrace_free_filter(&fixture->gops.ops);
+
+ if (fixture->error_str && !printed) {
+ pr_cont("*** %s ***", fixture->error_str);
+ printed = true;
+ }
+ }
+ return printed ? -1 : 0;
+}
+
+/* Test the storage passed across function_graph entry and return */
+static __init int test_graph_storage(void)
+{
+ int ret;
+
+ ret = test_graph_storage_single(&store_bytes[0]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[1]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[2]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_single(&store_bytes[3]);
+ if (ret)
+ return ret;
+ ret = test_graph_storage_multi();
+ if (ret)
+ return ret;
+ return 0;
+}
+#else
+static inline int test_graph_storage(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
-static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -776,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
return 0;
}
- return trace_graph_entry(trace);
+ return trace_graph_entry(trace, gops, fregs);
}
static struct fgraph_ops fgraph_ops __initdata = {
@@ -812,7 +1078,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* to detect and recover from possible hangs
*/
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
ret = register_ftrace_graph(&fgraph_ops);
if (ret) {
warn_failed_init_tracer(trace, ret);
@@ -855,7 +1121,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
cond_resched();
tracing_reset_online_cpus(&tr->array_buffer);
- set_graph_array(tr);
+ fgraph_ops.private = tr;
/*
* Some archs *cough*PowerPC*cough* add characters to the
@@ -912,6 +1178,8 @@ trace_selftest_startup_function_graph(struct tracer *trace,
ftrace_set_global_filter(NULL, 0, 1);
#endif
+ ret = test_graph_storage();
+
/* Don't test dynamic tracing, the function tracer already did */
out:
/* Stop it if we failed */
@@ -1221,7 +1489,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* reset the max latency */
tr->max_latency = 0;
- while (p->on_rq) {
+ while (task_is_runnable(p)) {
/*
* Sleep to make sure the -deadline thread is asleep too.
* On virtual machines we can't rely on timings,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 5a48dba912ea..0aa2514a6593 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock =
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
-int stack_tracer_enabled;
+static int stack_tracer_enabled;
static void print_max_stack(void)
{
@@ -514,26 +514,24 @@ static const struct file_operations stack_trace_filter_fops = {
#endif /* CONFIG_DYNAMIC_FTRACE */
int
-stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int was_enabled;
int ret;
- mutex_lock(&stack_sysctl_mutex);
+ guard(mutex)(&stack_sysctl_mutex);
was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (was_enabled == !!stack_tracer_enabled))
- goto out;
+ return ret;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
- out:
- mutex_unlock(&stack_sysctl_mutex);
return ret;
}
@@ -544,7 +542,7 @@ static __init int enable_stacktrace(char *str)
int len;
if ((len = str_has_prefix(str, "_filter=")))
- strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
+ strscpy(stack_trace_filter_buf, str + len);
stack_tracer_enabled = 1;
return 1;
@@ -580,3 +578,23 @@ static __init int stack_trace_init(void)
}
device_initcall(stack_trace_init);
+
+
+static const struct ctl_table trace_stack_sysctl_table[] = {
+ {
+ .procname = "stack_tracer_enabled",
+ .data = &stack_tracer_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = stack_trace_sysctl,
+ },
+};
+
+static int __init init_trace_stack_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_stack_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_stack_sysctls);
+
+
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index bb247beec447..b3b5586f104d 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session)
int ret = 0;
int i;
- mutex_lock(&session->stat_mutex);
+ guard(mutex)(&session->stat_mutex);
__reset_stat_session(session);
if (!ts->stat_cmp)
@@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session)
stat = ts->stat_start(ts);
if (!stat)
- goto exit;
+ return 0;
ret = insert_stat(root, stat, ts->stat_cmp);
if (ret)
- goto exit;
+ return ret;
/*
* Iterate over the tracer stat entries and store them in an rbtree.
@@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session)
goto exit_free_rbtree;
}
-exit:
- mutex_unlock(&session->stat_mutex);
return ret;
exit_free_rbtree:
__reset_stat_session(session);
- mutex_unlock(&session->stat_mutex);
return ret;
}
@@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret = -EINVAL;
+ int ret;
if (!trace)
return -EINVAL;
@@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace)
if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
return -EINVAL;
+ guard(mutex)(&all_stat_sessions_mutex);
+
/* Already registered? */
- mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
if (node->ts == trace)
- goto out;
+ return -EINVAL;
}
- ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- goto out;
+ return -ENOMEM;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- goto out;
+ return ret;
}
- ret = 0;
/* Register */
list_add_tail(&session->session_list, &all_stat_sessions);
- out:
- mutex_unlock(&all_stat_sessions_mutex);
- return ret;
+ return 0;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9c581d6da843..46aab0ab9350 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -299,6 +299,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int syscall_nr;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -338,6 +345,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_buffer fbuffer;
int syscall_nr;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -564,6 +578,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
for (i = 0; i < sys_data->nb_args; i++)
@@ -575,6 +590,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
unsigned long args[6];
bool valid_prog_array;
@@ -582,6 +598,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -602,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -611,7 +634,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
- !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+ !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
@@ -666,6 +689,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
} __aligned(8) param;
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+ perf_fetch_caller_regs(regs);
*(struct pt_regs **)&param = regs;
param.syscall_nr = rec->nr;
param.ret = rec->ret;
@@ -676,12 +700,20 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
+ struct pt_regs *fake_regs;
struct hlist_head *head;
bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ might_fault();
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -701,7 +733,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = perf_trace_buf_alloc(size, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
if (!rec)
return;
@@ -709,7 +741,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->ret = syscall_get_return_value(current, regs);
if ((valid_prog_array &&
- !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+ !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9e461362450a..f95a2c3d5b1b 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/rculist.h>
#include <linux/filter.h>
+#include <linux/percpu.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
@@ -58,11 +59,11 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
- struct inode *inode;
char *filename;
+ struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
- unsigned long nhit;
+ unsigned long __percpu *nhits;
struct trace_probe tp;
};
@@ -88,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev)
static int register_uprobe_event(struct trace_uprobe *tu);
static int unregister_uprobe_event(struct trace_uprobe *tu);
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data);
static int uretprobe_dispatcher(struct uprobe_consumer *con,
- unsigned long func, struct pt_regs *regs);
+ unsigned long func, struct pt_regs *regs,
+ __u64 *data);
#ifdef CONFIG_STACK_GROWSUP
static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n)
@@ -337,6 +340,12 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (!tu)
return ERR_PTR(-ENOMEM);
+ tu->nhits = alloc_percpu(unsigned long);
+ if (!tu->nhits) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
ret = trace_probe_init(&tu->tp, event, group, true, nargs);
if (ret < 0)
goto error;
@@ -349,6 +358,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
return tu;
error:
+ free_percpu(tu->nhits);
kfree(tu);
return ERR_PTR(ret);
@@ -362,6 +372,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
path_put(&tu->path);
trace_probe_cleanup(&tu->tp);
kfree(tu->filename);
+ free_percpu(tu->nhits);
kfree(tu);
}
@@ -487,11 +498,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
struct trace_uprobe *old_tu;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = validate_ref_ctr_offset(tu);
if (ret)
- goto end;
+ return ret;
/* register as an event */
old_tu = find_probe_event(trace_probe_name(&tu->tp),
@@ -500,11 +511,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_uprobe(tu, old_tu);
+ return -EEXIST;
}
- goto end;
+ return append_trace_uprobe(tu, old_tu);
}
ret = register_uprobe_event(tu);
@@ -514,14 +523,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
-end:
- mutex_unlock(&event_mutex);
-
return ret;
}
@@ -557,6 +563,14 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
+ trace_probe_log_init("trace_uprobe", argc, argv);
+
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
+ return -E2BIG;
+ }
+
if (argv[0][1] == ':')
event = &argv[0][2];
@@ -574,7 +588,6 @@ static int __trace_uprobe_create(int argc, const char **argv)
return -ECANCELED;
}
- trace_probe_log_init("trace_uprobe", argc, argv);
trace_probe_log_set_index(1); /* filename is the 2nd argument */
*arg++ = '\0';
@@ -681,7 +694,7 @@ static int __trace_uprobe_create(int argc, const char **argv)
tu->filename = filename;
/* parse arguments */
- for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+ for (i = 0; i < argc; i++) {
struct traceprobe_parse_context ctx = {
.flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
};
@@ -728,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command)
if (raw_command[0] == '-')
return dyn_event_release(raw_command, &trace_uprobe_ops);
- ret = trace_uprobe_create(raw_command);
+ ret = dyn_event_create(raw_command, &trace_uprobe_ops);
return ret == -ECANCELED ? -EINVAL : ret;
}
@@ -815,13 +828,21 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct dyn_event *ev = v;
struct trace_uprobe *tu;
+ unsigned long nhits;
+ int cpu;
if (!is_trace_uprobe(ev))
return 0;
tu = to_trace_uprobe(ev);
+
+ nhits = 0;
+ for_each_possible_cpu(cpu) {
+ nhits += per_cpu(*tu->nhits, cpu);
+ }
+
seq_printf(m, " %s %-44s %15lu\n", tu->filename,
- trace_probe_name(&tu->tp), tu->nhit);
+ trace_probe_name(&tu->tp), nhits);
return 0;
}
@@ -854,9 +875,11 @@ static const struct file_operations uprobe_profile_ops = {
struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
+ int dsize;
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
+#define MAX_UCB_BUFFER_SIZE PAGE_SIZE
static int uprobe_buffer_init(void)
{
@@ -940,12 +963,41 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
{
+ if (!ucb)
+ return;
mutex_unlock(&ucb->mutex);
}
+static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer **ucbp)
+{
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+
+ if (*ucbp)
+ return *ucbp;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ dsize = __get_data_size(&tu->tp, regs, NULL);
+
+ ucb = uprobe_buffer_get();
+ ucb->dsize = tu->tp.size + dsize;
+
+ if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) {
+ ucb->dsize = MAX_UCB_BUFFER_SIZE;
+ dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size;
+ }
+
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
+
+ *ucbp = ucb;
+ return ucb;
+}
+
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize,
+ struct uprobe_cpu_buffer *ucb,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
@@ -956,14 +1008,11 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
WARN_ON(call != trace_file->event_call);
- if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
- return;
-
if (trace_trigger_soft_disabled(trace_file))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ size = esize + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
@@ -977,23 +1026,26 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, ucb->dsize);
trace_event_buffer_commit(&fbuffer);
}
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
+ struct uprobe_cpu_buffer *ucb;
if (is_ret_probe(tu))
return 0;
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
+
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, 0, regs, ucb, link->file);
rcu_read_unlock();
return 0;
@@ -1001,13 +1053,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
+ struct uprobe_cpu_buffer *ucb;
+
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, func, regs, ucb, link->file);
rcu_read_unlock();
}
@@ -1047,43 +1102,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return trace_handle_return(s);
}
-typedef bool (*filter_func_t)(struct uprobe_consumer *self,
- enum uprobe_filter_ctx ctx,
- struct mm_struct *mm);
+typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
- int ret;
+ struct inode *inode = d_real_inode(tu->path.dentry);
+ struct uprobe *uprobe;
tu->consumer.filter = filter;
- tu->inode = d_real_inode(tu->path.dentry);
-
- if (tu->ref_ctr_offset)
- ret = uprobe_register_refctr(tu->inode, tu->offset,
- tu->ref_ctr_offset, &tu->consumer);
- else
- ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+ uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
+ if (IS_ERR(uprobe))
+ return PTR_ERR(uprobe);
- if (ret)
- tu->inode = NULL;
-
- return ret;
+ tu->uprobe = uprobe;
+ return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
+ bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- if (!tu->inode)
+ if (!tu->uprobe)
continue;
- uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
- tu->inode = NULL;
+ uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
+ sync = true;
+ tu->uprobe = NULL;
}
+ if (sync)
+ uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
@@ -1199,9 +1251,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
{
struct perf_event *event;
- if (filter->nr_systemwide)
- return true;
-
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
if (event->hw.target->mm == mm)
return true;
@@ -1282,7 +1331,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+ ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
@@ -1306,7 +1355,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
- err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
@@ -1316,8 +1365,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return err;
}
-static bool uprobe_perf_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
@@ -1326,6 +1374,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;
+ /*
+ * speculative short-circuiting check to avoid unnecessarily taking
+ * filter->rwlock below, if the uprobe has system-wide consumer
+ */
+ if (READ_ONCE(filter->nr_systemwide))
+ return true;
+
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
@@ -1335,10 +1390,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
+ struct uprobe_cpu_buffer *ucb;
struct hlist_head *head;
void *data;
int size, esize;
@@ -1346,9 +1402,13 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
#ifdef CONFIG_BPF_EVENTS
if (bpf_prog_array_valid(call)) {
+ const struct bpf_prog_array *array;
u32 ret;
- ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run);
+ rcu_read_lock_trace();
+ array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held());
+ ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run);
+ rcu_read_unlock_trace();
if (!ret)
return;
}
@@ -1356,7 +1416,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
+ size = esize + ucb->dsize;
size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
@@ -1379,13 +1440,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, ucb->dsize);
- if (size - esize > tu->tp.size + dsize) {
- int len = tu->tp.size + dsize;
-
- memset(data + len, 0, size - esize - len);
- }
+ if (size - esize > ucb->dsize)
+ memset(data + ucb->dsize, 0, size - esize - ucb->dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
@@ -1395,21 +1453,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
- if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ if (!uprobe_perf_filter(&tu->consumer, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
- __uprobe_perf_func(tu, 0, regs, ucb, dsize);
+ __uprobe_perf_func(tu, 0, regs, ucbp);
return 0;
}
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
- __uprobe_perf_func(tu, func, regs, ucb, dsize);
+ __uprobe_perf_func(tu, func, regs, ucbp);
}
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
@@ -1431,7 +1489,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
- *probe_addr = 0;
+ *probe_addr = tu->ref_ctr_offset;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@@ -1470,17 +1528,17 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
}
}
-static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
+static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs,
+ __u64 *data)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
+ struct uprobe_cpu_buffer *ucb = NULL;
int ret = 0;
-
tu = container_of(con, struct trace_uprobe, consumer);
- tu->nhit++;
+
+ this_cpu_inc(*tu->nhits);
udd.tu = tu;
udd.bp_addr = instruction_pointer(regs);
@@ -1490,30 +1548,24 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs, NULL);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- ret |= uprobe_trace_func(tu, regs, ucb, dsize);
+ ret |= uprobe_trace_func(tu, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- ret |= uprobe_perf_func(tu, regs, ucb, dsize);
+ ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return ret;
}
static int uretprobe_dispatcher(struct uprobe_consumer *con,
- unsigned long func, struct pt_regs *regs)
+ unsigned long func, struct pt_regs *regs,
+ __u64 *data)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
+ struct uprobe_cpu_buffer *ucb = NULL;
tu = container_of(con, struct trace_uprobe, consumer);
@@ -1525,18 +1577,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs, NULL);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- uretprobe_trace_func(tu, func, regs, ucb, dsize);
+ uretprobe_trace_func(tu, func, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- uretprobe_perf_func(tu, func, regs, ucb, dsize);
+ uretprobe_perf_func(tu, func, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return 0;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f24352..1921ade45be3 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
struct tracing_map_elt *elt = NULL;
int idx;
- idx = atomic_inc_return(&map->next_elt);
+ idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
if (idx < map->max_elts) {
elt = *(TRACING_MAP_ELT(map->elts, idx));
if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
{
unsigned int i;
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
atomic64_set(&map->hits, 0);
atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
map->map_bits = map_bits;
map->max_elts = (1 << map_bits);
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
map->map_size = (1 << (map_bits + 1));
map->ops = ops;
@@ -845,15 +845,11 @@ int tracing_map_init(struct tracing_map *map)
static int cmp_entries_dup(const void *A, const void *B)
{
const struct tracing_map_sort_entry *a, *b;
- int ret = 0;
a = *(const struct tracing_map_sort_entry **)A;
b = *(const struct tracing_map_sort_entry **)B;
- if (memcmp(a->key, b->key, a->elt->map->key_size))
- ret = 1;
-
- return ret;
+ return memcmp(a->key, b->key, a->elt->map->key_size);
}
static int cmp_entries_sum(const void *A, const void *B)