diff options
Diffstat (limited to 'kernel/trace')
62 files changed, 9201 insertions, 3674 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61c541c36596..d570b8b9c0a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst -config HAVE_FUNCTION_GRAPH_RETVAL +config HAVE_FUNCTION_GRAPH_FREGS bool +config HAVE_FTRACE_GRAPH_FUNC + bool + help + True if ftrace_graph_func() is defined. + config HAVE_DYNAMIC_FTRACE bool help @@ -57,6 +62,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of ftrace_regs_get_argument() and ftrace_regs_get_stack_pointer(). +config HAVE_FTRACE_REGS_HAVING_PT_REGS + bool + help + If this is set, ftrace_regs has pt_regs, thus it can convert to + pt_regs without allocating memory. + config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE bool help @@ -163,7 +174,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU config GENERIC_TRACER bool @@ -204,7 +215,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done @@ -232,7 +243,7 @@ config FUNCTION_GRAPH_TRACER config FUNCTION_GRAPH_RETVAL bool "Kernel Function Graph Return Value" - depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on HAVE_FUNCTION_GRAPH_FREGS depends on FUNCTION_GRAPH_TRACER default n help @@ -242,6 +253,16 @@ config FUNCTION_GRAPH_RETVAL enable it via the trace option funcgraph-retval. See Documentation/trace/ftrace.rst +config FUNCTION_GRAPH_RETADDR + bool "Kernel Function Graph Return Address" + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return address when + using function graph tracer. It can be helpful to locate code line that + the function is called. This feature is off by default, and you can + enable it via the trace option funcgraph-retaddr. + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER @@ -286,10 +307,9 @@ config DYNAMIC_FTRACE_WITH_ARGS config FPROBE bool "Kernel Function Probe (fprobe)" - depends on FUNCTION_TRACER - depends on DYNAMIC_FTRACE_WITH_REGS - depends on HAVE_RETHOOK - select RETHOOK + depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC + depends on DYNAMIC_FTRACE_WITH_ARGS + select FUNCTION_GRAPH_TRACER default n help This option enables kernel function probe (fprobe) based on ftrace. @@ -965,7 +985,7 @@ config FTRACE_RECORD_RECURSION config FTRACE_RECORD_RECURSION_SIZE int "Max number of recursed functions to record" - default 128 + default 128 depends on FTRACE_RECORD_RECURSION help This defines the limit of number of functions that can be @@ -974,6 +994,19 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config FTRACE_VALIDATE_RCU_IS_WATCHING + bool "Validate RCU is on during ftrace execution" + depends on FUNCTION_TRACER + depends on ARCH_WANTS_NO_INSTR + help + All callbacks that attach to the function tracing have some sort of + protection against recursion. This option is only to verify that + ftrace (and other users of ftrace_test_recursion_trylock()) are not + called outside of RCU, as if they are, it can cause a race. But it + also has a noticeable overhead when enabled. + + If unsure, say N + config RING_BUFFER_RECORD_RECURSION bool "Record functions that recurse in the ring buffer" depends on FTRACE_RECORD_RECURSION @@ -1123,7 +1156,7 @@ config PREEMPTIRQ_DELAY_TEST config SYNTH_EVENT_GEN_TEST tristate "Test module for in-kernel synthetic event generation" - depends on SYNTH_EVENTS + depends on SYNTH_EVENTS && m help This option creates a test module to check the base functionality of in-kernel synthetic event definition and @@ -1136,7 +1169,7 @@ config SYNTH_EVENT_GEN_TEST config KPROBE_EVENT_GEN_TEST tristate "Test module for in-kernel kprobe event generation" - depends on KPROBE_EVENTS + depends on KPROBE_EVENTS && m help This option creates a test module to check the base functionality of in-kernel kprobe event definition. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d5d94510afd3..3679a6d18934 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); /* * some device names have larger paths - convert the slashes @@ -618,8 +617,9 @@ err: return ret; } -static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, char __user *arg) +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -628,29 +628,18 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; + mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } return 0; } - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) -{ - int ret; - - mutex_lock(&q->debugfs_mutex); - ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->debugfs_mutex); - - return ret; -} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -674,12 +663,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .pid = cbuts.pid, }; + mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } @@ -733,12 +724,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) int ret, start = 0; char b[BDEVNAME_SIZE]; - mutex_lock(&q->debugfs_mutex); - switch (cmd) { case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); - ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -750,17 +739,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) start = 1; fallthrough; case BLKTRACESTOP: - ret = __blk_trace_startstop(q, start); + ret = blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = __blk_trace_remove(q); + ret = blk_trace_remove(q); break; default: ret = -ENOTTY; break; } - - mutex_unlock(&q->debugfs_mutex); return ret; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7ac6c52b25eb..adc947587eb8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -24,7 +24,6 @@ #include <linux/key.h> #include <linux/verification.h> #include <linux/namei.h> -#include <linux/fileattr.h> #include <net/bpf_sk_storage.h> @@ -358,17 +357,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *bpf_get_probe_write_proto(void) -{ - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", - current->comm, task_pid_nr(current)); - - return &bpf_probe_write_user_proto; -} - #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 @@ -620,7 +608,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_sample_data *sd) + u64 flags, struct perf_raw_record *raw, + struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); @@ -645,6 +634,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; + perf_sample_save_raw_data(sd, event, raw); + return perf_event_output(event, sd, regs); } @@ -688,9 +679,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - err = __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); @@ -749,9 +739,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - ret = __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); @@ -798,34 +787,13 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = { .ret_btf_id = &bpf_task_pt_regs_ids[0], }; -BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) -{ - struct bpf_array *array = container_of(map, struct bpf_array, map); - struct cgroup *cgrp; - - if (unlikely(idx >= array->map.max_entries)) - return -E2BIG; - - cgrp = READ_ONCE(array->ptrs[idx]); - if (unlikely(!cgrp)) - return -EAGAIN; - - return task_under_cgroup_hierarchy(current, cgrp); -} - -static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { - .func = bpf_current_task_under_cgroup, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_ANYTHING, -}; - struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; enum pid_type type; + bool has_siginfo; + struct kernel_siginfo info; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); @@ -833,30 +801,49 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); static void do_bpf_send_signal(struct irq_work *entry) { struct send_signal_irq_work *work; + struct kernel_siginfo *siginfo; work = container_of(entry, struct send_signal_irq_work, irq_work); - group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV; + + group_send_sig_info(work->sig, siginfo, work->task, work->type); put_task_struct(work->task); } -static int bpf_send_signal_common(u32 sig, enum pid_type type) +static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value) { struct send_signal_irq_work *work = NULL; + struct kernel_siginfo info; + struct kernel_siginfo *siginfo; + + if (!task) { + task = current; + siginfo = SEND_SIG_PRIV; + } else { + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + info.si_value.sival_ptr = (void *)(unsigned long)value; + siginfo = &info; + } /* Similar to bpf_probe_write_user, task needs to be * in a sound condition and kernel memory access be * permitted in order to send signal to the current * task. */ - if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; /* Task should not be pid=1 to avoid kernel panic. */ - if (unlikely(is_global_init(current))) + if (unlikely(is_global_init(task))) return -EPERM; - if (irqs_disabled()) { + if (!preemptible()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -871,19 +858,22 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. */ - work->task = get_task_struct(current); + work->task = get_task_struct(task); + work->has_siginfo = siginfo == &info; + if (work->has_siginfo) + copy_siginfo(&work->info, &info); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); return 0; } - return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); + return group_send_sig_info(sig, siginfo, task, type); } BPF_CALL_1(bpf_send_signal, u32, sig) { - return bpf_send_signal_common(sig, PIDTYPE_TGID); + return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } static const struct bpf_func_proto bpf_send_signal_proto = { @@ -895,7 +885,7 @@ static const struct bpf_func_proto bpf_send_signal_proto = { BPF_CALL_1(bpf_send_signal_thread, u32, sig) { - return bpf_send_signal_common(sig, PIDTYPE_PID); + return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } static const struct bpf_func_proto bpf_send_signal_thread_proto = { @@ -1053,9 +1043,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip) { u32 instr; - /* Being extra safe in here in case entry ip is on the page-edge. */ - if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) - return fentry_ip; + /* We want to be extra safe in case entry ip is on the page edge, + * but otherwise we need to avoid get_kernel_nofault()'s overhead. + */ + if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { + if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) + return fentry_ip; + } else { + instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); + } if (is_endbr(instr)) fentry_ip -= ENDBR_INSN_SIZE; return fentry_ip; @@ -1182,9 +1178,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; @@ -1197,7 +1190,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return -ENOENT; return entry_cnt * br_entry_size; -#endif } static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { @@ -1224,7 +1216,8 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_LONG, + .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, + .arg3_size = sizeof(u64), }; BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) @@ -1240,7 +1233,8 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_LONG, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, + .arg2_size = sizeof(u64), }; BPF_CALL_1(get_func_arg_cnt, void *, ctx) @@ -1367,8 +1361,8 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_ptr: data to verify - * @sig_ptr: signature of the data + * @data_p: data to verify + * @sig_p: signature of the data * @trusted_keyring: keyring with keys trusted for signature verification * * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* @@ -1376,10 +1370,12 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) * * Return: 0 on success, a negative value on error. */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, - struct bpf_dynptr_kern *sig_ptr, +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; @@ -1412,14 +1408,14 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, __bpf_kfunc_end_defs(); -BTF_SET8_START(key_sig_kfunc_set) +BTF_KFUNCS_START(key_sig_kfunc_set) BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) #ifdef CONFIG_SYSTEM_DATA_VERIFICATION BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif -BTF_SET8_END(key_sig_kfunc_set) +BTF_KFUNCS_END(key_sig_kfunc_set) static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { .owner = THIS_MODULE, @@ -1435,75 +1431,11 @@ static int __init bpf_key_sig_kfuncs_init(void) late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */ -/* filesystem kfuncs */ -__bpf_kfunc_start_defs(); - -/** - * bpf_get_file_xattr - get xattr of a file - * @file: file to get xattr from - * @name__str: name of the xattr - * @value_ptr: output buffer of the xattr value - * - * Get xattr *name__str* of *file* and store the output in *value_ptr*. - * - * For security reasons, only *name__str* with prefix "user." is allowed. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, - struct bpf_dynptr_kern *value_ptr) -{ - struct dentry *dentry; - u32 value_len; - void *value; - int ret; - - if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EPERM; - - value_len = __bpf_dynptr_size(value_ptr); - value = __bpf_dynptr_data_rw(value_ptr, value_len); - if (!value) - return -EINVAL; - - dentry = file_dentry(file); - ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); - if (ret) - return ret; - return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); -} - -__bpf_kfunc_end_defs(); - -BTF_SET8_START(fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_SET8_END(fs_kfunc_set_ids) - -static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) -{ - if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id)) - return 0; - - /* Only allow to attach from LSM hooks, to avoid recursion */ - return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; -} - -static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { - .owner = THIS_MODULE, - .set = &fs_kfunc_set_ids, - .filter = bpf_get_file_xattr_filter, -}; - -static int __init bpf_fs_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); -} - -late_initcall(bpf_fs_kfuncs_init); - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; @@ -1525,8 +1457,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: @@ -1545,13 +1475,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_write_user: - return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? - NULL : bpf_get_probe_write_proto(); case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: @@ -1575,6 +1500,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_cgrp_storage_get_proto; case BPF_FUNC_cgrp_storage_delete: return &bpf_cgrp_storage_delete_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif case BPF_FUNC_send_signal: return &bpf_send_signal_proto; @@ -1582,8 +1509,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_get_ns_current_pid_tgid: - return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -1597,7 +1522,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: - return &bpf_get_task_stack_proto; + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: @@ -1629,8 +1555,45 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: - return bpf_base_func_proto(func_id); + break; } + + func_proto = bpf_base_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : &bpf_probe_write_user_proto; + default: + return NULL; + } +} + +static bool is_kprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + +static inline bool is_kprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; +} + +static inline bool is_uprobe_multi(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_uprobe_session(const struct bpf_prog *prog) +{ + return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static const struct bpf_func_proto * @@ -1642,21 +1605,21 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto; + return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; - if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: @@ -2008,6 +1971,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2070,6 +2035,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); @@ -2278,6 +2246,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; + struct bpf_prog *prog = NULL; int ret; mutex_lock(&bpf_event_mutex); @@ -2286,9 +2255,10 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); + if (!old_array) + goto put; + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); - if (ret == -ENOENT) - goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { @@ -2296,11 +2266,23 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_free_sleepable(old_array); } - bpf_prog_put(event->prog); +put: + prog = event->prog; event->prog = NULL; unlock: mutex_unlock(&bpf_event_mutex); + + if (prog) { + /* + * It could be that the bpf_prog is not sleepable (and will be freed + * via normal RCU), but is called from a point that supports sleepable + * programs and uses tasks-trace-RCU. + */ + synchronize_rcu_tasks_trace(); + + bpf_prog_put(prog); + } } int perf_event_query_prog_array(struct perf_event *event, void __user *info) @@ -2370,16 +2352,26 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) } static __always_inline -void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct bpf_prog *prog = link->link.prog; + struct bpf_run_ctx *old_run_ctx; + struct bpf_trace_run_ctx run_ctx; + cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } + + run_ctx.bpf_cookie = link->cookie; + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); + + bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } @@ -2408,12 +2400,12 @@ out: #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ - void bpf_trace_run##x(struct bpf_prog *prog, \ + void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ - __bpf_trace_run(prog, args); \ + __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); @@ -2429,9 +2421,10 @@ BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); -static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; + struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's @@ -2443,18 +2436,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; - return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, - prog); + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } -int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { - return __bpf_probe_register(btp, prog); -} - -int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) -{ - return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, @@ -2577,6 +2564,12 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ +struct bpf_session_run_ctx { + struct bpf_run_ctx run_ctx; + bool is_return; + void *data; +}; + #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; @@ -2590,7 +2583,7 @@ struct bpf_kprobe_multi_link { }; struct bpf_kprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; @@ -2600,6 +2593,20 @@ struct user_syms { char *buf; }; +#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS +static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); +#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) +#else +#define bpf_kprobe_multi_pt_regs_ptr() (NULL) +#endif + +static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) +{ + unsigned long ip = ftrace_get_symaddr(fentry_ip); + + return ip ? : fentry_ip; +} + static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; @@ -2679,6 +2686,7 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { + u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies); u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); struct bpf_kprobe_multi_link *kmulti_link; u32 ucount = info->kprobe_multi.count; @@ -2686,6 +2694,8 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, if (!uaddrs ^ !ucount) return -EINVAL; + if (ucookies && !ucount) + return -EINVAL; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; @@ -2699,6 +2709,18 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, else ucount = kmulti_link->cnt; + if (ucookies) { + if (kmulti_link->cookies) { + if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64))) + return -EFAULT; + } else { + for (i = 0; i < ucount; i++) { + if (put_user(0, ucookies + i)) + return -EFAULT; + } + } + } + if (kallsyms_show_value(current_cred())) { if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) return -EFAULT; @@ -2713,7 +2735,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; @@ -2754,7 +2776,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) if (WARN_ON_ONCE(!ctx)) return 0; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; @@ -2771,30 +2794,38 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs) + unsigned long entry_ip, struct ftrace_regs *fregs, + bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + .data = data, + }, .link = link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; + struct pt_regs *regs; int err; if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); - err = 0; + err = 1; goto out; } migrate_disable(); rcu_read_lock(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); @@ -2807,25 +2838,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; + int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); - return 0; + err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, false, data); + return is_kprobe_session(link->link.prog) ? err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) @@ -2958,7 +2992,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + if (!is_kprobe_multi(prog)) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; @@ -3039,10 +3073,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error; - if (flags & BPF_F_KPROBE_MULTI_RETURN) - link->fp.exit_handler = kprobe_multi_link_exit_handler; - else + if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; + if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) + link->fp.exit_handler = kprobe_multi_link_exit_handler; + if (is_kprobe_session(prog)) + link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; @@ -3107,7 +3143,9 @@ struct bpf_uprobe { loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; + bool session; }; struct bpf_uprobe_multi_link { @@ -3120,20 +3158,20 @@ struct bpf_uprobe_multi_link { }; struct bpf_uprobe_multi_run_ctx { - struct bpf_run_ctx run_ctx; + struct bpf_session_run_ctx session_ctx; unsigned long entry_ip; struct bpf_uprobe *uprobe; }; -static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) +static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; - for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@ -3141,7 +3179,10 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) @@ -3149,9 +3190,6 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - if (umulti_link->task) - put_task_struct(umulti_link->task); - path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); } @@ -3167,7 +3205,8 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; - long left; + char *p, *buf; + long left = 0; if (!upath ^ !upath_size) return -EINVAL; @@ -3181,26 +3220,23 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; - if (upath) { - char *p, *buf; - - upath_size = min_t(u32, upath_size, PATH_MAX); - - buf = kmalloc(upath_size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - p = d_path(&umulti_link->path, buf, upath_size); - if (IS_ERR(p)) { - kfree(buf); - return PTR_ERR(p); - } - upath_size = buf + upath_size - p; - left = copy_to_user(upath, p, upath_size); + upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX; + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { kfree(buf); - if (left) - return -EFAULT; - info->uprobe_multi.path_size = upath_size; + return PTR_ERR(p); } + upath_size = buf + upath_size - p; + + if (upath) + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; @@ -3227,25 +3263,30 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, unsigned long entry_ip, - struct pt_regs *regs) + struct pt_regs *regs, + bool is_return, void *data) { struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { + .session_ctx = { + .is_return = is_return, + .data = data, + }, .entry_ip = entry_ip, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; - bool sleepable = prog->aux->sleepable; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; - int err = 0; + int err; - if (link->task && current != link->task) + if (link->task && !same_thread_group(current, link->task)) return 0; if (sleepable) @@ -3255,7 +3296,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, migrate_disable(); - old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); @@ -3269,8 +3310,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, } static bool -uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, - struct mm_struct *mm) +uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; @@ -3279,28 +3319,36 @@ uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx } static int -uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; + int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); + ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data); + if (uprobe->session) + return ret ? UPROBE_HANDLER_IGNORE : 0; + return 0; } static int -uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, + __u64 *data) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); - return uprobe_prog_run(uprobe, func, regs); + uprobe_prog_run(uprobe, func, regs, true, data); + return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->entry_ip; } @@ -3308,7 +3356,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; - run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, + session_ctx.run_ctx); return run_ctx->uprobe->cookie; } @@ -3332,7 +3381,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; - if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; @@ -3346,8 +3395,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; + pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt) + if (!upath || !uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3371,11 +3421,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error_path_put; } - pid = attr->link_create.uprobe_multi.pid; if (pid) { - rcu_read_lock(); - task = get_pid_task(find_vpid(pid), PIDTYPE_PID); - rcu_read_unlock(); + task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); if (!task) { err = -ESRCH; goto error_path_put; @@ -3410,11 +3457,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uprobes[i].link = link; - if (flags & BPF_F_UPROBE_MULTI_RETURN) - uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; - else + if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; - + if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + if (is_uprobe_session(prog)) + uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } @@ -3429,22 +3477,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); - goto error_free; + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + link->cnt = i; + goto error_unregister; } } err = bpf_link_prime(&link->link, &link_primer); if (err) - goto error_free; + goto error_unregister; return bpf_link_settle(&link_primer); +error_unregister: + bpf_uprobe_unregister(uprobes, link->cnt); + error_free: kvfree(uprobes); kfree(link); @@ -3468,3 +3520,65 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return 0; } #endif /* CONFIG_UPROBES */ + +__bpf_kfunc_start_defs(); + +__bpf_kfunc bool bpf_session_is_return(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->is_return; +} + +__bpf_kfunc __u64 *bpf_session_cookie(void) +{ + struct bpf_session_run_ctx *session_ctx; + + session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); + return session_ctx->data; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_session_is_return) +BTF_ID_FLAGS(func, bpf_session_cookie) +BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) + +static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + return 0; + + if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) + return -EACCES; + + return 0; +} + +static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { + .owner = THIS_MODULE, + .set = &kprobe_multi_kfunc_set_ids, + .filter = bpf_kprobe_multi_filter, +}; + +static int __init bpf_kprobe_multi_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); +} + +late_initcall(bpf_kprobe_multi_kfuncs_init); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, + u64 value) +{ + if (type != PIDTYPE_PID && type != PIDTYPE_TGID) + return -EINVAL; + + return bpf_send_signal_common(sig, type, task, value); +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h index 9acbc11ac7bb..c4075b56becc 100644 --- a/kernel/trace/bpf_trace.h +++ b/kernel/trace/bpf_trace.h @@ -19,7 +19,7 @@ TRACE_EVENT(bpf_trace_printk, ), TP_fast_assign( - __assign_str(bpf_string, bpf_string); + __assign_str(bpf_string); ), TP_printk("%s", __get_str(bpf_string)) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c83c005e654e..5dddfc2149f6 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -7,9 +7,11 @@ * * Highly modified by Steven Rostedt (VMware). */ +#include <linux/bits.h> #include <linux/jump_label.h> #include <linux/suspend.h> #include <linux/ftrace.h> +#include <linux/static_call.h> #include <linux/slab.h> #include <trace/events/sched.h> @@ -17,19 +19,487 @@ #include "ftrace_internal.h" #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -#define ASSIGN_OPS_HASH(opsname, val) \ - .func_hash = val, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#else -#define ASSIGN_OPS_HASH(opsname, val) -#endif +/* + * FGRAPH_FRAME_SIZE: Size in bytes of the meta data on the shadow stack + * FGRAPH_FRAME_OFFSET: Size in long words of the meta data frame + */ +#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack) +#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long)) + +/* + * On entry to a function (via function_graph_enter()), a new fgraph frame + * (ftrace_ret_stack) is pushed onto the stack as well as a word that + * holds a bitmask and a type (called "bitmap"). The bitmap is defined as: + * + * bits: 0 - 9 offset in words from the previous ftrace_ret_stack + * + * bits: 10 - 11 Type of storage + * 0 - reserved + * 1 - bitmap of fgraph_array index + * 2 - reserved data + * + * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP): + * bits: 12 - 27 The bitmap of fgraph_ops fgraph_array index + * That is, it's a bitmask of 0-15 (16 bits) + * where if a corresponding ops in the fgraph_array[] + * expects a callback from the return of the function + * it's corresponding bit will be set. + * + * + * The top of the ret_stack (when not empty) will always have a reference + * word that points to the last fgraph frame that was saved. + * + * For reserved data: + * bits: 12 - 17 The size in words that is stored + * bits: 18 - 23 The index of fgraph_array, which shows who is stored + * + * That is, at the end of function_graph_enter, if the first and forth + * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called + * on the return of the function being traced, and the forth fgraph_ops + * stored two words of data, this is what will be on the task's shadow + * ret_stack: (the stack grows upward) + * + * ret_stack[SHADOW_STACK_OFFSET] + * | SHADOW_STACK_TASK_VARS(ret_stack)[15] | + * ... + * | SHADOW_STACK_TASK_VARS(ret_stack)[0] | + * ret_stack[SHADOW_STACK_MAX_OFFSET] + * ... + * | | <- task->curr_ret_stack + * +--------------------------------------------+ + * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET| + * | *or put another way* | + * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \ | This is for fgraph_ops[3]. + * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \ | The data size is 2 words. + * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ | + * | (offset2:FGRAPH_FRAME_OFFSET+3) | <- the offset2 is from here + * +--------------------------------------------+ ( It is 4 words from the ret_stack) + * | STORED DATA WORD 2 | + * | STORED DATA WORD 1 | + * +--------------------------------------------+ + * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET| + * | *or put another way* | + * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \ | + * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ | + * | (offset1:FGRAPH_FRAME_OFFSET) | <- the offset1 is from here + * +--------------------------------------------+ + * | struct ftrace_ret_stack | + * | (stores the saved ret pointer) | <- the offset points here + * +--------------------------------------------+ + * | (X) | (N) | ( N words away from + * | | previous ret_stack) + * ... + * ret_stack[0] + * + * If a backtrace is required, and the real return pointer needs to be + * fetched, then it looks at the task's curr_ret_stack offset, if it + * is greater than zero (reserved, or right before popped), it would mask + * the value by FGRAPH_FRAME_OFFSET_MASK to get the offset of the + * ftrace_ret_stack structure stored on the shadow stack. + */ + +/* + * The following is for the top word on the stack: + * + * FGRAPH_FRAME_OFFSET (0-9) holds the offset delta to the fgraph frame + * FGRAPH_TYPE (10-11) holds the type of word this is. + * (RESERVED or BITMAP) + */ +#define FGRAPH_FRAME_OFFSET_BITS 10 +#define FGRAPH_FRAME_OFFSET_MASK GENMASK(FGRAPH_FRAME_OFFSET_BITS - 1, 0) + +#define FGRAPH_TYPE_BITS 2 +#define FGRAPH_TYPE_MASK GENMASK(FGRAPH_TYPE_BITS - 1, 0) +#define FGRAPH_TYPE_SHIFT FGRAPH_FRAME_OFFSET_BITS + +enum { + FGRAPH_TYPE_RESERVED = 0, + FGRAPH_TYPE_BITMAP = 1, + FGRAPH_TYPE_DATA = 2, +}; + +/* + * For BITMAP type: + * FGRAPH_INDEX (12-27) bits holding the gops index wanting return callback called + */ +#define FGRAPH_INDEX_BITS 16 +#define FGRAPH_INDEX_MASK GENMASK(FGRAPH_INDEX_BITS - 1, 0) +#define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS) + +/* + * For DATA type: + * FGRAPH_DATA (12-17) bits hold the size of data (in words) + * FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is for + * + * Note: + * data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words. + */ +#define FGRAPH_DATA_BITS 5 +#define FGRAPH_DATA_MASK GENMASK(FGRAPH_DATA_BITS - 1, 0) +#define FGRAPH_DATA_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS) +#define FGRAPH_MAX_DATA_SIZE (sizeof(long) * (1 << FGRAPH_DATA_BITS)) + +#define FGRAPH_DATA_INDEX_BITS 4 +#define FGRAPH_DATA_INDEX_MASK GENMASK(FGRAPH_DATA_INDEX_BITS - 1, 0) +#define FGRAPH_DATA_INDEX_SHIFT (FGRAPH_DATA_SHIFT + FGRAPH_DATA_BITS) + +#define FGRAPH_MAX_INDEX \ + ((FGRAPH_INDEX_SIZE << FGRAPH_DATA_BITS) + FGRAPH_RET_INDEX) + +#define FGRAPH_ARRAY_SIZE FGRAPH_INDEX_BITS + +/* + * SHADOW_STACK_SIZE: The size in bytes of the entire shadow stack + * SHADOW_STACK_OFFSET: The size in long words of the shadow stack + * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be added + */ +#define SHADOW_STACK_SIZE (4096) +#define SHADOW_STACK_OFFSET (SHADOW_STACK_SIZE / sizeof(long)) +/* Leave on a buffer at the end */ +#define SHADOW_STACK_MAX_OFFSET \ + (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE)) + +/* RET_STACK(): Return the frame from a given @offset from task @t */ +#define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset])) + +/* + * Each fgraph_ops has a reservered unsigned long at the end (top) of the + * ret_stack to store task specific state. + */ +#define SHADOW_STACK_TASK_VARS(ret_stack) \ + ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - FGRAPH_ARRAY_SIZE])) DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph); int ftrace_graph_active; +static struct kmem_cache *fgraph_stack_cachep; + +static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE]; +static unsigned long fgraph_array_bitmask; + +/* LRU index table for fgraph_array */ +static int fgraph_lru_table[FGRAPH_ARRAY_SIZE]; +static int fgraph_lru_next; +static int fgraph_lru_last; + +/* Initialize fgraph_lru_table with unused index */ +static void fgraph_lru_init(void) +{ + int i; + + for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) + fgraph_lru_table[i] = i; +} + +/* Release the used index to the LRU table */ +static int fgraph_lru_release_index(int idx) +{ + if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE || + WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1)) + return -1; + + fgraph_lru_table[fgraph_lru_last] = idx; + fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE; + + clear_bit(idx, &fgraph_array_bitmask); + return 0; +} + +/* Allocate a new index from LRU table */ +static int fgraph_lru_alloc_index(void) +{ + int idx = fgraph_lru_table[fgraph_lru_next]; + + /* No id is available */ + if (idx == -1) + return -1; + + fgraph_lru_table[fgraph_lru_next] = -1; + fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE; + + set_bit(idx, &fgraph_array_bitmask); + return idx; +} + +/* Get the offset to the fgraph frame from a ret_stack value */ +static inline int __get_offset(unsigned long val) +{ + return val & FGRAPH_FRAME_OFFSET_MASK; +} + +/* Get the type of word from a ret_stack value */ +static inline int __get_type(unsigned long val) +{ + return (val >> FGRAPH_TYPE_SHIFT) & FGRAPH_TYPE_MASK; +} + +/* Get the data_index for a DATA type ret_stack word */ +static inline int __get_data_index(unsigned long val) +{ + return (val >> FGRAPH_DATA_INDEX_SHIFT) & FGRAPH_DATA_INDEX_MASK; +} + +/* Get the data_size for a DATA type ret_stack word */ +static inline int __get_data_size(unsigned long val) +{ + return ((val >> FGRAPH_DATA_SHIFT) & FGRAPH_DATA_MASK) + 1; +} + +/* Get the word from the ret_stack at @offset */ +static inline unsigned long get_fgraph_entry(struct task_struct *t, int offset) +{ + return t->ret_stack[offset]; +} + +/* Get the FRAME_OFFSET from the word from the @offset on ret_stack */ +static inline int get_frame_offset(struct task_struct *t, int offset) +{ + return __get_offset(t->ret_stack[offset]); +} + +/* For BITMAP type: get the bitmask from the @offset at ret_stack */ +static inline unsigned long +get_bitmap_bits(struct task_struct *t, int offset) +{ + return (t->ret_stack[offset] >> FGRAPH_INDEX_SHIFT) & FGRAPH_INDEX_MASK; +} + +/* Write the bitmap to the ret_stack at @offset (does index, offset and bitmask) */ +static inline void +set_bitmap(struct task_struct *t, int offset, unsigned long bitmap) +{ + t->ret_stack[offset] = (bitmap << FGRAPH_INDEX_SHIFT) | + (FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET; +} + +/* For DATA type: get the data saved under the ret_stack word at @offset */ +static inline void *get_data_type_data(struct task_struct *t, int offset) +{ + unsigned long val = t->ret_stack[offset]; + + if (__get_type(val) != FGRAPH_TYPE_DATA) + return NULL; + offset -= __get_data_size(val); + return (void *)&t->ret_stack[offset]; +} + +/* Create the ret_stack word for a DATA type */ +static inline unsigned long make_data_type_val(int idx, int size, int offset) +{ + return (idx << FGRAPH_DATA_INDEX_SHIFT) | + ((size - 1) << FGRAPH_DATA_SHIFT) | + (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT) | offset; +} + +/* ftrace_graph_entry set to this to tell some archs to run function graph */ +static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) +{ + return 0; +} + +/* ftrace_graph_return set to this to tell some archs to run function graph */ +static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) +{ +} + +static void ret_stack_set_task_var(struct task_struct *t, int idx, long val) +{ + unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack); + + gvals[idx] = val; +} + +static unsigned long * +ret_stack_get_task_var(struct task_struct *t, int idx) +{ + unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack); + + return &gvals[idx]; +} + +static void ret_stack_init_task_vars(unsigned long *ret_stack) +{ + unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack); + + memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE); +} + +/** + * fgraph_reserve_data - Reserve storage on the task's ret_stack + * @idx: The index of fgraph_array + * @size_bytes: The size in bytes to reserve + * + * Reserves space of up to FGRAPH_MAX_DATA_SIZE bytes on the + * task's ret_stack shadow stack, for a given fgraph_ops during + * the entryfunc() call. If entryfunc() returns zero, the storage + * is discarded. An entryfunc() can only call this once per iteration. + * The fgraph_ops retfunc() can retrieve this stored data with + * fgraph_retrieve_data(). + * + * Returns: On success, a pointer to the data on the stack. + * Otherwise, NULL if there's not enough space left on the + * ret_stack for the data, or if fgraph_reserve_data() was called + * more than once for a single entryfunc() call. + */ +void *fgraph_reserve_data(int idx, int size_bytes) +{ + unsigned long val; + void *data; + int curr_ret_stack = current->curr_ret_stack; + int data_size; + + if (size_bytes > FGRAPH_MAX_DATA_SIZE) + return NULL; + + /* Convert the data size to number of longs. */ + data_size = (size_bytes + sizeof(long) - 1) >> (sizeof(long) == 4 ? 2 : 3); + + val = get_fgraph_entry(current, curr_ret_stack - 1); + data = ¤t->ret_stack[curr_ret_stack]; + + curr_ret_stack += data_size + 1; + if (unlikely(curr_ret_stack >= SHADOW_STACK_MAX_OFFSET)) + return NULL; + + val = make_data_type_val(idx, data_size, __get_offset(val) + data_size + 1); + + /* Set the last word to be reserved */ + current->ret_stack[curr_ret_stack - 1] = val; + + /* Make sure interrupts see this */ + barrier(); + current->curr_ret_stack = curr_ret_stack; + /* Again sync with interrupts, and reset reserve */ + current->ret_stack[curr_ret_stack - 1] = val; + + return data; +} + +/** + * fgraph_retrieve_data - Retrieve stored data from fgraph_reserve_data() + * @idx: the index of fgraph_array (fgraph_ops::idx) + * @size_bytes: pointer to retrieved data size. + * + * This is to be called by a fgraph_ops retfunc(), to retrieve data that + * was stored by the fgraph_ops entryfunc() on the function entry. + * That is, this will retrieve the data that was reserved on the + * entry of the function that corresponds to the exit of the function + * that the fgraph_ops retfunc() is called on. + * + * Returns: The stored data from fgraph_reserve_data() called by the + * matching entryfunc() for the retfunc() this is called from. + * Or NULL if there was nothing stored. + */ +void *fgraph_retrieve_data(int idx, int *size_bytes) +{ + return fgraph_retrieve_parent_data(idx, size_bytes, 0); +} + +/** + * fgraph_get_task_var - retrieve a task specific state variable + * @gops: The ftrace_ops that owns the task specific variable + * + * Every registered fgraph_ops has a task state variable + * reserved on the task's ret_stack. This function returns the + * address to that variable. + * + * Returns the address to the fgraph_ops @gops tasks specific + * unsigned long variable. + */ +unsigned long *fgraph_get_task_var(struct fgraph_ops *gops) +{ + return ret_stack_get_task_var(current, gops->idx); +} + +/* + * @offset: The offset into @t->ret_stack to find the ret_stack entry + * @frame_offset: Where to place the offset into @t->ret_stack of that entry + * + * Returns a pointer to the previous ret_stack below @offset or NULL + * when it reaches the bottom of the stack. + * + * Calling this with: + * + * offset = task->curr_ret_stack; + * do { + * ret_stack = get_ret_stack(task, offset, &offset); + * } while (ret_stack); + * + * Will iterate through all the ret_stack entries from curr_ret_stack + * down to the first one. + */ +static inline struct ftrace_ret_stack * +get_ret_stack(struct task_struct *t, int offset, int *frame_offset) +{ + int offs; + + BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long)); + + if (unlikely(offset <= 0)) + return NULL; + + offs = get_frame_offset(t, --offset); + if (WARN_ON_ONCE(offs <= 0 || offs > offset)) + return NULL; + + offset -= offs; + + *frame_offset = offset; + return RET_STACK(t, offset); +} + +/** + * fgraph_retrieve_parent_data - get data from a parent function + * @idx: The index into the fgraph_array (fgraph_ops::idx) + * @size_bytes: A pointer to retrieved data size + * @depth: The depth to find the parent (0 is the current function) + * + * This is similar to fgraph_retrieve_data() but can be used to retrieve + * data from a parent caller function. + * + * Return: a pointer to the specified parent data or NULL if not found + */ +void *fgraph_retrieve_parent_data(int idx, int *size_bytes, int depth) +{ + struct ftrace_ret_stack *ret_stack = NULL; + int offset = current->curr_ret_stack; + unsigned long val; + + if (offset <= 0) + return NULL; + + for (;;) { + int next_offset; + + ret_stack = get_ret_stack(current, offset, &next_offset); + if (!ret_stack || --depth < 0) + break; + offset = next_offset; + } + + if (!ret_stack) + return NULL; + + offset--; + + val = get_fgraph_entry(current, offset); + while (__get_type(val) == FGRAPH_TYPE_DATA) { + if (__get_data_index(val) == idx) + goto found; + offset -= __get_data_size(val) + 1; + val = get_fgraph_entry(current, offset); + } + return NULL; +found: + if (size_bytes) + *size_bytes = __get_data_size(val) * sizeof(long); + return get_data_type_data(current, offset); +} + /* Both enabled by default (can be cleared by function_graph tracer flags */ -static bool fgraph_sleep_time = true; +bool fgraph_sleep_time = true; #ifdef CONFIG_DYNAMIC_FTRACE /* @@ -51,6 +521,29 @@ int __weak ftrace_disable_ftrace_graph_caller(void) } #endif +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return 0; +} + +static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ +} + +static struct fgraph_ops fgraph_stub = { + .entryfunc = ftrace_graph_entry_stub, + .retfunc = ftrace_graph_ret_stub, +}; + +static struct fgraph_ops *fgraph_direct_gops = &fgraph_stub; +DEFINE_STATIC_CALL(fgraph_func, ftrace_graph_entry_stub); +DEFINE_STATIC_CALL(fgraph_retfunc, ftrace_graph_ret_stub); +static DEFINE_STATIC_KEY_TRUE(fgraph_do_direct); + /** * ftrace_graph_stop - set to permanently disable function graph tracing * @@ -67,10 +560,12 @@ void ftrace_graph_stop(void) /* Add a function return address to the trace stack on thread info.*/ static int ftrace_push_return_trace(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) + unsigned long frame_pointer, unsigned long *retp, + int fgraph_idx) { - unsigned long long calltime; - int index; + struct ftrace_ret_stack *ret_stack; + unsigned long val; + int offset; if (unlikely(ftrace_graph_is_dead())) return -EBUSY; @@ -78,32 +573,64 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, if (!current->ret_stack) return -EBUSY; + BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long)); + + /* Set val to "reserved" with the delta to the new fgraph frame */ + val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET; + /* * We must make sure the ret_stack is tested before we read * anything else. */ smp_rmb(); - /* The return trace stack is full */ - if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + /* + * Check if there's room on the shadow stack to fit a fraph frame + * and a bitmap word. + */ + if (current->curr_ret_stack + FGRAPH_FRAME_OFFSET + 1 >= SHADOW_STACK_MAX_OFFSET) { atomic_inc(¤t->trace_overrun); return -EBUSY; } - calltime = trace_clock_local(); + offset = READ_ONCE(current->curr_ret_stack); + ret_stack = RET_STACK(current, offset); + offset += FGRAPH_FRAME_OFFSET; - index = ++current->curr_ret_stack; + /* ret offset = FGRAPH_FRAME_OFFSET ; type = reserved */ + current->ret_stack[offset] = val; + ret_stack->ret = ret; + /* + * The unwinders expect curr_ret_stack to point to either zero + * or an offset where to find the next ret_stack. Even though the + * ret stack might be bogus, we want to write the ret and the + * offset to find the ret_stack before we increment the stack point. + * If an interrupt comes in now before we increment the curr_ret_stack + * it may blow away what we wrote. But that's fine, because the + * offset will still be correct (even though the 'ret' won't be). + * What we worry about is the offset being correct after we increment + * the curr_ret_stack and before we update that offset, as if an + * interrupt comes in and does an unwind stack dump, it will need + * at least a correct offset! + */ + barrier(); + WRITE_ONCE(current->curr_ret_stack, offset + 1); + /* + * This next barrier is to ensure that an interrupt coming in + * will not corrupt what we are about to write. + */ barrier(); - current->ret_stack[index].ret = ret; - current->ret_stack[index].func = func; - current->ret_stack[index].calltime = calltime; + + /* Still keep it reserved even if an interrupt came in */ + current->ret_stack[offset] = val; + + ret_stack->ret = ret; + ret_stack->func = func; #ifdef HAVE_FUNCTION_GRAPH_FP_TEST - current->ret_stack[index].fp = frame_pointer; + ret_stack->fp = frame_pointer; #endif -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR - current->ret_stack[index].retp = retp; -#endif - return 0; + ret_stack->retp = retp; + return offset; } /* @@ -120,55 +647,92 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, # define MCOUNT_INSN_SIZE 0 #endif -int function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) +/* If the caller does not use ftrace, call this function. */ +int function_graph_enter_regs(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp, + struct ftrace_regs *fregs) { struct ftrace_graph_ent trace; + unsigned long bitmap = 0; + int offset; + int bit; + int i; -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS - /* - * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses are unpredictable anyway. - * Ex: BPF trampoline may call original function and may skip frame - * depending on type of BPF programs attached. - */ - if (ftrace_direct_func_count && - ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) + bit = ftrace_test_recursion_trylock(func, ret); + if (bit < 0) return -EBUSY; -#endif + trace.func = func; trace.depth = ++current->curr_ret_depth; - if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) + offset = ftrace_push_return_trace(ret, func, frame_pointer, retp, 0); + if (offset < 0) goto out; - /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) +#ifdef CONFIG_HAVE_STATIC_CALL + if (static_branch_likely(&fgraph_do_direct)) { + int save_curr_ret_stack = current->curr_ret_stack; + + if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs)) + bitmap |= BIT(fgraph_direct_gops->idx); + else + /* Clear out any saved storage */ + current->curr_ret_stack = save_curr_ret_stack; + } else +#endif + { + for_each_set_bit(i, &fgraph_array_bitmask, + sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) { + struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]); + int save_curr_ret_stack; + + if (gops == &fgraph_stub) + continue; + + save_curr_ret_stack = current->curr_ret_stack; + if (ftrace_ops_test(&gops->ops, func, NULL) && + gops->entryfunc(&trace, gops, fregs)) + bitmap |= BIT(i); + else + /* Clear out any saved storage */ + current->curr_ret_stack = save_curr_ret_stack; + } + } + + if (!bitmap) goto out_ret; + /* + * Since this function uses fgraph_idx = 0 as a tail-call checking + * flag, set that bit always. + */ + set_bitmap(current, offset, bitmap | BIT(0)); + ftrace_test_recursion_unlock(bit); return 0; out_ret: - current->curr_ret_stack--; + current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1; out: current->curr_ret_depth--; + ftrace_test_recursion_unlock(bit); return -EBUSY; } /* Retrieve a function return address to the trace stack on thread info.*/ -static void +static struct ftrace_ret_stack * ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, - unsigned long frame_pointer) + unsigned long frame_pointer, int *offset) { - int index; + struct ftrace_ret_stack *ret_stack; - index = current->curr_ret_stack; + ret_stack = get_ret_stack(current, current->curr_ret_stack, offset); - if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { + if (unlikely(!ret_stack)) { ftrace_graph_stop(); - WARN_ON(1); + WARN(1, "Bad function graph ret_stack pointer: %d", + current->curr_ret_stack); /* Might as well panic, otherwise we have no where to go */ *ret = (unsigned long)panic; - return; + return NULL; } #ifdef HAVE_FUNCTION_GRAPH_FP_TEST @@ -186,30 +750,32 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, * Note, -mfentry does not use frame pointers, and this test * is not needed if CC_USING_FENTRY is set. */ - if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + if (unlikely(ret_stack->fp != frame_pointer)) { ftrace_graph_stop(); WARN(1, "Bad frame pointer: expected %lx, received %lx\n" " from func %ps return to %lx\n", - current->ret_stack[index].fp, + ret_stack->fp, frame_pointer, - (void *)current->ret_stack[index].func, - current->ret_stack[index].ret); + (void *)ret_stack->func, + ret_stack->ret); *ret = (unsigned long)panic; - return; + return NULL; } #endif - *ret = current->ret_stack[index].ret; - trace->func = current->ret_stack[index].func; - trace->calltime = current->ret_stack[index].calltime; + *offset += FGRAPH_FRAME_OFFSET; + *ret = ret_stack->ret; + trace->func = ret_stack->func; trace->overrun = atomic_read(¤t->trace_overrun); - trace->depth = current->curr_ret_depth--; + trace->depth = current->curr_ret_depth; /* * We still want to trace interrupts coming in if * max_depth is set to 1. Make sure the decrement is * seen before ftrace_graph_return. */ barrier(); + + return ret_stack; } /* @@ -237,52 +803,76 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; -/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ -struct fgraph_ret_regs; - /* * Send the trace to the ring-buffer. * @return the original return address. */ -static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, - unsigned long frame_pointer) +static inline unsigned long +__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer) { + struct ftrace_ret_stack *ret_stack; struct ftrace_graph_ret trace; + unsigned long bitmap; unsigned long ret; + int offset; + int i; + + ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); + + if (unlikely(!ret_stack)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + return (unsigned long)panic; + } + + if (fregs) + ftrace_regs_set_instruction_pointer(fregs, ret); - ftrace_pop_return_trace(&trace, &ret, frame_pointer); #ifdef CONFIG_FUNCTION_GRAPH_RETVAL - trace.retval = fgraph_ret_regs_return_value(ret_regs); + trace.retval = ftrace_regs_get_return_value(fregs); #endif - trace.rettime = trace_clock_local(); - ftrace_graph_return(&trace); + + bitmap = get_bitmap_bits(current, offset); + +#ifdef CONFIG_HAVE_STATIC_CALL + if (static_branch_likely(&fgraph_do_direct)) { + if (test_bit(fgraph_direct_gops->idx, &bitmap)) + static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs); + } else +#endif + { + for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) { + struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]); + + if (gops == &fgraph_stub) + continue; + + gops->retfunc(&trace, gops, fregs); + } + } + /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of * curr_ret_stack is after that. */ barrier(); - current->curr_ret_stack--; - - if (unlikely(!ret)) { - ftrace_graph_stop(); - WARN_ON(1); - /* Might as well panic. What else to do? */ - ret = (unsigned long)panic; - } + current->curr_ret_stack = offset - FGRAPH_FRAME_OFFSET; + current->curr_ret_depth--; return ret; } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can - * leave only ftrace_return_to_handler(ret_regs). + * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * leave only ftrace_return_to_handler(fregs). */ -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL -unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS +unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs) { - return __ftrace_return_to_handler(ret_regs, - fgraph_ret_regs_frame_pointer(ret_regs)); + return __ftrace_return_to_handler(fregs, + ftrace_regs_get_frame_pointer(fregs)); } #else unsigned long ftrace_return_to_handler(unsigned long frame_pointer) @@ -293,7 +883,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) /** * ftrace_graph_get_ret_stack - return the entry of the shadow stack - * @task: The task to read the shadow stack from + * @task: The task to read the shadow stack from. * @idx: Index down the shadow stack * * Return the ret_struct on the shadow stack of the @task at the @@ -305,115 +895,151 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int idx) { - idx = task->curr_ret_stack - idx; + struct ftrace_ret_stack *ret_stack = NULL; + int offset = task->curr_ret_stack; - if (idx >= 0 && idx <= task->curr_ret_stack) - return &task->ret_stack[idx]; + if (offset < 0) + return NULL; - return NULL; + do { + ret_stack = get_ret_stack(task, offset, &offset); + } while (ret_stack && --idx >= 0); + + return ret_stack; } /** - * ftrace_graph_ret_addr - convert a potentially modified stack return address - * to its original value + * ftrace_graph_top_ret_addr - return the top return address in the shadow stack + * @task: The task to read the shadow stack from. + * + * Return the first return address on the shadow stack of the @task, which is + * not the fgraph's return_to_handler. + */ +unsigned long ftrace_graph_top_ret_addr(struct task_struct *task) +{ + unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler); + struct ftrace_ret_stack *ret_stack = NULL; + int offset = task->curr_ret_stack; + + if (offset < 0) + return 0; + + do { + ret_stack = get_ret_stack(task, offset, &offset); + } while (ret_stack && ret_stack->ret == return_handler); + + return ret_stack ? ret_stack->ret : 0; +} + +/** + * ftrace_graph_ret_addr - return the original value of the return address + * @task: The task the unwinder is being executed on + * @idx: An initialized pointer to the next stack index to use + * @ret: The current return address (likely pointing to return_handler) + * @retp: The address on the stack of the current return location * * This function can be called by stack unwinding code to convert a found stack - * return address ('ret') to its original value, in case the function graph + * return address (@ret) to its original value, in case the function graph * tracer has modified it to be 'return_to_handler'. If the address hasn't - * been modified, the unchanged value of 'ret' is returned. + * been modified, the unchanged value of @ret is returned. * - * 'idx' is a state variable which should be initialized by the caller to zero - * before the first call. + * @idx holds the last index used to know where to start from. It should be + * initialized to zero for the first iteration as that will mean to start + * at the top of the shadow stack. If the location is found, this pointer + * will be assigned that location so that if called again, it will continue + * where it left off. * - * 'retp' is a pointer to the return address on the stack. It's ignored if - * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. + * @retp is a pointer to the return address on the stack. */ -#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, unsigned long *retp) { - int index = task->curr_ret_stack; + struct ftrace_ret_stack *ret_stack; + unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler); int i; - if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) + if (ret != return_handler) return ret; - if (index < 0) + if (!idx) return ret; - for (i = 0; i <= index; i++) - if (task->ret_stack[i].retp == retp) - return task->ret_stack[i].ret; + i = *idx ? : task->curr_ret_stack; + while (i > 0) { + ret_stack = get_ret_stack(task, i, &i); + if (!ret_stack) + break; + /* + * For the tail-call, there would be 2 or more ftrace_ret_stacks on + * the ret_stack, which records "return_to_handler" as the return + * address except for the last one. + * But on the real stack, there should be 1 entry because tail-call + * reuses the return address on the stack and jump to the next function. + * Thus we will continue to find real return address. + */ + if (ret_stack->retp == retp && + ret_stack->ret != return_handler) { + *idx = i; + return ret_stack->ret; + } + } return ret; } -#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ -unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, - unsigned long ret, unsigned long *retp) -{ - int task_idx; - - if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) - return ret; - - task_idx = task->curr_ret_stack; - - if (!task->ret_stack || task_idx < *idx) - return ret; - - task_idx -= *idx; - (*idx)++; - - return task->ret_stack[task_idx].ret; -} -#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ static struct ftrace_ops graph_ops = { .func = ftrace_graph_func, - .flags = FTRACE_OPS_FL_INITIALIZED | - FTRACE_OPS_FL_PID | - FTRACE_OPS_GRAPH_STUB, + .flags = FTRACE_OPS_GRAPH_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, /* trampoline_size is only needed for dynamically allocated tramps */ #endif - ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; -void ftrace_graph_sleep_time_control(bool enable) +void fgraph_init_ops(struct ftrace_ops *dst_ops, + struct ftrace_ops *src_ops) { - fgraph_sleep_time = enable; + dst_ops->flags = FTRACE_OPS_FL_PID | FTRACE_OPS_GRAPH_STUB; + +#ifdef CONFIG_DYNAMIC_FTRACE + if (src_ops) { + dst_ops->func_hash = &src_ops->local_hash; + mutex_init(&dst_ops->local_hash.regex_lock); + INIT_LIST_HEAD(&dst_ops->subop_list); + dst_ops->flags |= FTRACE_OPS_FL_INITIALIZED; + } +#endif } -int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +void ftrace_graph_sleep_time_control(bool enable) { - return 0; + fgraph_sleep_time = enable; } /* * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_stub_graph(struct ftrace_graph_ret *); +void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; -static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ -static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +static int alloc_retstack_tasklist(unsigned long **ret_stack_list) { int i; int ret = 0; int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; struct task_struct *g, *t; + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return -ENOMEM; + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = - kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack_list[i] = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL); if (!ret_stack_list[i]) { start = 0; end = i; @@ -431,9 +1057,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) if (t->ret_stack == NULL) { atomic_set(&t->trace_overrun, 0); - t->curr_ret_stack = -1; + ret_stack_init_task_vars(ret_stack_list[start]); + t->curr_ret_stack = 0; t->curr_ret_depth = -1; - /* Make sure the tasks see the -1 first: */ + /* Make sure the tasks see the 0 first: */ smp_wmb(); t->ret_stack = ret_stack_list[start++]; } @@ -443,7 +1070,7 @@ unlock: rcu_read_unlock(); free: for (i = start; i < end; i++) - kfree(ret_stack_list[i]); + kmem_cache_free(fgraph_stack_cachep, ret_stack_list[i]); return ret; } @@ -454,7 +1081,6 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, unsigned int prev_state) { unsigned long long timestamp; - int index; /* * Does the user want to count the time a function was asleep. @@ -471,63 +1097,19 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt, if (!next->ftrace_timestamp) return; - /* - * Update all the counters in next to make up for the - * time next was sleeping. - */ - timestamp -= next->ftrace_timestamp; - - for (index = next->curr_ret_stack; index >= 0; index--) - next->ret_stack[index].calltime += timestamp; -} - -static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) -{ - if (!ftrace_ops_test(&global_ops, trace->func, NULL)) - return 0; - return __ftrace_graph_entry(trace); -} - -/* - * The function graph tracer should only trace the functions defined - * by set_ftrace_filter and set_ftrace_notrace. If another function - * tracer ops is registered, the graph tracer requires testing the - * function against the global ops, and not just trace any function - * that any ftrace_ops registered. - */ -void update_function_graph_func(void) -{ - struct ftrace_ops *op; - bool do_test = false; - - /* - * The graph and global ops share the same set of functions - * to test. If any other ops is on the list, then - * the graph tracing needs to test if its the function - * it should call. - */ - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op != &global_ops && op != &graph_ops && - op != &ftrace_list_end) { - do_test = true; - /* in double loop, break out with goto */ - goto out; - } - } while_for_each_ftrace_op(op); - out: - if (do_test) - ftrace_graph_entry = ftrace_graph_entry_test; - else - ftrace_graph_entry = __ftrace_graph_entry; + next->ftrace_sleeptime += timestamp - next->ftrace_timestamp; } -static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); +static DEFINE_PER_CPU(unsigned long *, idle_ret_stack); static void -graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +graph_init_task(struct task_struct *t, unsigned long *ret_stack) { atomic_set(&t->trace_overrun, 0); + ret_stack_init_task_vars(ret_stack); t->ftrace_timestamp = 0; + t->curr_ret_stack = 0; + t->curr_ret_depth = -1; /* make curr_ret_stack visible before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; @@ -539,7 +1121,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) */ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { - t->curr_ret_stack = -1; + t->curr_ret_stack = 0; t->curr_ret_depth = -1; /* * The idle task has no parent, it either has its own @@ -549,14 +1131,14 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; + unsigned long *ret_stack; + + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return; ret_stack = per_cpu(idle_ret_stack, cpu); if (!ret_stack) { - ret_stack = - kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL); if (!ret_stack) return; per_cpu(idle_ret_stack, cpu) = ret_stack; @@ -570,15 +1152,16 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; - t->curr_ret_stack = -1; + t->curr_ret_stack = 0; t->curr_ret_depth = -1; if (ftrace_graph_active) { - struct ftrace_ret_stack *ret_stack; + unsigned long *ret_stack; - ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, - sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return; + + ret_stack = kmem_cache_alloc(fgraph_stack_cachep, GFP_KERNEL); if (!ret_stack) return; graph_init_task(t, ret_stack); @@ -587,24 +1170,67 @@ void ftrace_graph_init_task(struct task_struct *t) void ftrace_graph_exit_task(struct task_struct *t) { - struct ftrace_ret_stack *ret_stack = t->ret_stack; + unsigned long *ret_stack = t->ret_stack; t->ret_stack = NULL; /* NULL must become visible to IRQs before we free it: */ barrier(); - kfree(ret_stack); + if (ret_stack) { + if (WARN_ON_ONCE(!fgraph_stack_cachep)) + return; + kmem_cache_free(fgraph_stack_cachep, ret_stack); + } } +#ifdef CONFIG_DYNAMIC_FTRACE +static int fgraph_pid_func(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + struct trace_array *tr = gops->ops.private; + int pid; + + if (tr) { + pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + if (pid == FTRACE_PID_IGNORE) + return 0; + if (pid != FTRACE_PID_TRACE && + pid != current->pid) + return 0; + } + + return gops->saved_func(trace, gops, fregs); +} + +void fgraph_update_pid_func(void) +{ + struct fgraph_ops *gops; + struct ftrace_ops *op; + + if (!(graph_ops.flags & FTRACE_OPS_FL_INITIALIZED)) + return; + + list_for_each_entry(op, &graph_ops.subop_list, list) { + if (op->flags & FTRACE_OPS_FL_PID) { + gops = container_of(op, struct fgraph_ops, ops); + gops->entryfunc = ftrace_pids_enabled(op) ? + fgraph_pid_func : gops->saved_func; + if (ftrace_graph_active == 1) + static_call_update(fgraph_func, gops->entryfunc); + } + } +} +#endif + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { - struct ftrace_ret_stack **ret_stack_list; + unsigned long **ret_stack_list; int ret, cpu; - ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, - sizeof(struct ftrace_ret_stack *), - GFP_KERNEL); + ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(*ret_stack_list), GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; @@ -630,60 +1256,180 @@ static int start_graph_tracing(void) return ret; } +static void init_task_vars(int idx) +{ + struct task_struct *g, *t; + int cpu; + + for_each_online_cpu(cpu) { + if (idle_task(cpu)->ret_stack) + ret_stack_set_task_var(idle_task(cpu), idx, 0); + } + + read_lock(&tasklist_lock); + for_each_process_thread(g, t) { + if (t->ret_stack) + ret_stack_set_task_var(t, idx, 0); + } + read_unlock(&tasklist_lock); +} + +static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops) +{ + trace_func_graph_ent_t func = NULL; + trace_func_graph_ret_t retfunc = NULL; + int i; + + if (gops) { + func = gops->entryfunc; + retfunc = gops->retfunc; + fgraph_direct_gops = gops; + } else { + for_each_set_bit(i, &fgraph_array_bitmask, + sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) { + func = fgraph_array[i]->entryfunc; + retfunc = fgraph_array[i]->retfunc; + fgraph_direct_gops = fgraph_array[i]; + } + } + if (WARN_ON_ONCE(!func)) + return; + + static_call_update(fgraph_func, func); + static_call_update(fgraph_retfunc, retfunc); + if (enable_branch) + static_branch_disable(&fgraph_do_direct); +} + +static void ftrace_graph_disable_direct(bool disable_branch) +{ + if (disable_branch) + static_branch_disable(&fgraph_do_direct); + static_call_update(fgraph_func, ftrace_graph_entry_stub); + static_call_update(fgraph_retfunc, ftrace_graph_ret_stub); + fgraph_direct_gops = &fgraph_stub; +} + +/* The cpu_boot init_task->ret_stack will never be freed */ +static int fgraph_cpu_init(unsigned int cpu) +{ + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + return 0; +} + int register_ftrace_graph(struct fgraph_ops *gops) { + static bool fgraph_initialized; + int command = 0; int ret = 0; + int i = -1; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); - /* we currently allow only one tracer registered at a time */ - if (ftrace_graph_active) { - ret = -EBUSY; - goto out; + if (!fgraph_stack_cachep) { + fgraph_stack_cachep = kmem_cache_create("fgraph_stack", + SHADOW_STACK_SIZE, + SHADOW_STACK_SIZE, 0, NULL); + if (!fgraph_stack_cachep) + return -ENOMEM; } - register_pm_notifier(&ftrace_suspend_notifier); + if (!fgraph_initialized) { + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph:online", + fgraph_cpu_init, NULL); + if (ret < 0) { + pr_warn("fgraph: Error to init cpu hotplug support\n"); + return ret; + } + fgraph_initialized = true; + ret = 0; + } - ftrace_graph_active++; - ret = start_graph_tracing(); - if (ret) { - ftrace_graph_active--; - goto out; + if (!fgraph_array[0]) { + /* The array must always have real data on it */ + for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) + fgraph_array[i] = &fgraph_stub; + fgraph_lru_init(); } - ftrace_graph_return = gops->retfunc; + i = fgraph_lru_alloc_index(); + if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) + return -ENOSPC; + gops->idx = i; - /* - * Update the indirect function to the entryfunc, and the - * function that gets called to the entry_test first. Then - * call the update fgraph entry function to determine if - * the entryfunc should be called directly or not. - */ - __ftrace_graph_entry = gops->entryfunc; - ftrace_graph_entry = ftrace_graph_entry_test; - update_function_graph_func(); + ftrace_graph_active++; + + if (ftrace_graph_active == 2) + ftrace_graph_disable_direct(true); + + if (ftrace_graph_active == 1) { + ftrace_graph_enable_direct(false, gops); + register_pm_notifier(&ftrace_suspend_notifier); + ret = start_graph_tracing(); + if (ret) + goto error; + /* + * Some archs just test to see if these are not + * the default function + */ + ftrace_graph_return = return_run; + ftrace_graph_entry = entry_run; + command = FTRACE_START_FUNC_RET; + } else { + init_task_vars(gops->idx); + } + /* Always save the function, and reset at unregistering */ + gops->saved_func = gops->entryfunc; - ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); -out: - mutex_unlock(&ftrace_lock); + ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); + if (!ret) + fgraph_array[i] = gops; + +error: + if (ret) { + ftrace_graph_active--; + gops->saved_func = NULL; + fgraph_lru_release_index(i); + } return ret; } void unregister_ftrace_graph(struct fgraph_ops *gops) { - mutex_lock(&ftrace_lock); + int command = 0; + + guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - goto out; + return; + + if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || + fgraph_array[gops->idx] != gops)) + return; + + if (fgraph_lru_release_index(gops->idx) < 0) + return; + + fgraph_array[gops->idx] = &fgraph_stub; ftrace_graph_active--; - ftrace_graph_return = ftrace_stub_graph; - ftrace_graph_entry = ftrace_graph_entry_stub; - __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); - unregister_pm_notifier(&ftrace_suspend_notifier); - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); - out: - mutex_unlock(&ftrace_lock); + if (!ftrace_graph_active) + command = FTRACE_STOP_FUNC_RET; + + ftrace_shutdown_subops(&graph_ops, &gops->ops, command); + + if (ftrace_graph_active == 1) + ftrace_graph_enable_direct(true, NULL); + else if (!ftrace_graph_active) + ftrace_graph_disable_direct(false); + + if (!ftrace_graph_active) { + ftrace_graph_return = ftrace_stub_graph; + ftrace_graph_entry = ftrace_graph_entry_stub; + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + } + gops->saved_func = NULL; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 9ff018245840..33082c4e8154 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -8,98 +8,224 @@ #include <linux/fprobe.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> -#include <linux/rethook.h> +#include <linux/list.h> +#include <linux/mutex.h> #include <linux/slab.h> #include <linux/sort.h> +#include <asm/fprobe.h> + #include "trace.h" -struct fprobe_rethook_node { - struct rethook_node node; - unsigned long entry_ip; - unsigned long entry_parent_ip; - char data[]; -}; +#define FPROBE_IP_HASH_BITS 8 +#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS) -static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) -{ - struct fprobe_rethook_node *fpr; - struct rethook_node *rh = NULL; - struct fprobe *fp; - void *entry_data = NULL; - int ret = 0; +#define FPROBE_HASH_BITS 6 +#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS) - fp = container_of(ops, struct fprobe, ops); +#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2)) - if (fp->exit_handler) { - rh = rethook_try_get(fp->rethook); - if (!rh) { - fp->nmissed++; - return; - } - fpr = container_of(rh, struct fprobe_rethook_node, node); - fpr->entry_ip = ip; - fpr->entry_parent_ip = parent_ip; - if (fp->entry_data_size) - entry_data = fpr->data; +/* + * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still + * exists. The key is the address of fprobe instance. + * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe + * instance related to the funciton address. The key is the ftrace IP + * address. + * + * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp + * are set NULL and delete those from both hash tables (by hlist_del_rcu). + * After an RCU grace period, the fprobe_hlist itself will be released. + * + * fprobe_table and fprobe_ip_table can be accessed from either + * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held. + * - RCU hlist traversal under disabling preempt + */ +static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; +static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static DEFINE_MUTEX(fprobe_mutex); + +/* + * Find first fprobe in the hlist. It will be iterated twice in the entry + * probe, once for correcting the total required size, the second time is + * calling back the user handlers. + * Thus the hlist in the fprobe_table must be sorted and new probe needs to + * be added *before* the first fprobe. + */ +static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +{ + struct fprobe_hlist_node *node; + struct hlist_head *head; + + head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (node->addr == ip) + return node; } + return NULL; +} +NOKPROBE_SYMBOL(find_first_fprobe_node); + +/* Node insertion and deletion requires the fprobe_mutex */ +static void insert_fprobe_node(struct fprobe_hlist_node *node) +{ + unsigned long ip = node->addr; + struct fprobe_hlist_node *next; + struct hlist_head *head; - if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); + lockdep_assert_held(&fprobe_mutex); - /* If entry_handler returns !0, nmissed is not counted. */ - if (rh) { - if (ret) - rethook_recycle(rh); - else - rethook_hook(rh, ftrace_get_regs(fregs), true); + next = find_first_fprobe_node(ip); + if (next) { + hlist_add_before_rcu(&node->hlist, &next->hlist); + return; } + head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; + hlist_add_head_rcu(&node->hlist, head); } -static void fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +/* Return true if there are synonims */ +static bool delete_fprobe_node(struct fprobe_hlist_node *node) { - struct fprobe *fp; - int bit; + lockdep_assert_held(&fprobe_mutex); - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + return !!find_first_fprobe_node(node->addr); +} - /* recursion detection has to go before any traceable function and - * all functions before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; +/* Check existence of the fprobe */ +static bool is_fprobe_still_exist(struct fprobe *fp) +{ + struct hlist_head *head; + struct fprobe_hlist *fph; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_for_each_entry_rcu(fph, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (fph->fp == fp) + return true; } - __fprobe_handler(ip, parent_ip, ops, fregs); - ftrace_test_recursion_unlock(bit); + return false; +} +NOKPROBE_SYMBOL(is_fprobe_still_exist); + +static int add_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + struct hlist_head *head; + + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (is_fprobe_still_exist(fp)) + return -EEXIST; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_add_head_rcu(&fp->hlist_array->hlist, head); + return 0; +} + +static int del_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (!is_fprobe_still_exist(fp)) + return -ENOENT; + + fph->fp = NULL; + hlist_del_rcu(&fph->hlist); + return 0; +} + +#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER + +/* The arch should encode fprobe_header info into one unsigned long */ +#define FPROBE_HEADER_SIZE_IN_LONG 1 + +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD || + !arch_fprobe_header_encodable(fp))) + return false; + *stack = arch_encode_fprobe_header(fp, size_words); + return true; } -NOKPROBE_SYMBOL(fprobe_handler); -static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) { + *fp = arch_decode_fprobe_header_fp(*stack); + *size_words = arch_decode_fprobe_header_size(*stack); +} + +#else + +/* Generic fprobe_header */ +struct __fprobe_header { struct fprobe *fp; - int bit; + unsigned long size_words; +} __packed; - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; +#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header)) - /* recursion detection has to go before any traceable function and - * all functions called before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD)) + return false; + + fph->fp = fp; + fph->size_words = size_words; + return true; +} + +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + *fp = fph->fp; + *size_words = fph->size_words; +} + +#endif + +/* + * fprobe shadow stack management: + * Since fprobe shares a single fgraph_ops, it needs to share the stack entry + * among the probes on the same function exit. Note that a new probe can be + * registered before a target function is returning, we can not use the hash + * table to find the corresponding probes. Thus the probe address is stored on + * the shadow stack with its entry data size. + * + */ +static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + if (!fp->entry_handler) + return 0; + return fp->entry_handler(fp, ip, parent_ip, fregs, data); +} + +static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + int ret; /* * This user handler is shared with other kprobes and is not expected to be * called recursively. So if any other kprobe handler is running, this will @@ -108,44 +234,182 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, */ if (unlikely(kprobe_running())) { fp->nmissed++; - goto recursion_unlock; + return 0; } kprobe_busy_begin(); - __fprobe_handler(ip, parent_ip, ops, fregs); + ret = __fprobe_handler(ip, parent_ip, fp, fregs, data); kprobe_busy_end(); - -recursion_unlock: - ftrace_test_recursion_unlock(bit); + return ret; } -static void fprobe_exit_handler(struct rethook_node *rh, void *data, - unsigned long ret_ip, struct pt_regs *regs) +static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct fprobe *fp = (struct fprobe *)data; - struct fprobe_rethook_node *fpr; - int bit; + struct fprobe_hlist_node *node, *first; + unsigned long *fgraph_data = NULL; + unsigned long func = trace->func; + unsigned long ret_ip; + int reserved_words; + struct fprobe *fp; + int used, ret; - if (!fp || fprobe_disabled(fp)) - return; + if (WARN_ON_ONCE(!fregs)) + return 0; + + first = node = find_first_fprobe_node(func); + if (unlikely(!first)) + return 0; - fpr = container_of(rh, struct fprobe_rethook_node, node); + reserved_words = 0; + hlist_for_each_entry_from_rcu(node, hlist) { + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (!fp || !fp->exit_handler) + continue; + /* + * Since fprobe can be enabled until the next loop, we ignore the + * fprobe's disabled flag in this loop. + */ + reserved_words += + FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); + } + node = first; + if (reserved_words) { + fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); + if (unlikely(!fgraph_data)) { + hlist_for_each_entry_from_rcu(node, hlist) { + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (fp && !fprobe_disabled(fp)) + fp->nmissed++; + } + return 0; + } + } /* - * we need to assure no calls to traceable functions in-between the - * end of fprobe_handler and the beginning of fprobe_exit_handler. + * TODO: recursion detection has been done in the fgraph. Thus we need + * to add a callback to increment missed counter. */ - bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); - if (bit < 0) { - fp->nmissed++; + ret_ip = ftrace_regs_get_return_address(fregs); + used = 0; + hlist_for_each_entry_from_rcu(node, hlist) { + int data_size; + void *data; + + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (!fp || fprobe_disabled(fp)) + continue; + + data_size = fp->entry_data_size; + if (data_size && fp->exit_handler) + data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG; + else + data = NULL; + + if (fprobe_shared_with_kprobes(fp)) + ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data); + else + ret = __fprobe_handler(func, ret_ip, fp, fregs, data); + + /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */ + if (!ret && fp->exit_handler) { + int size_words = SIZE_IN_LONG(data_size); + + if (write_fprobe_header(&fgraph_data[used], fp, size_words)) + used += FPROBE_HEADER_SIZE_IN_LONG + size_words; + } + } + if (used < reserved_words) + memset(fgraph_data + used, 0, reserved_words - used); + + /* If any exit_handler is set, data must be used. */ + return used != 0; +} +NOKPROBE_SYMBOL(fprobe_entry); + +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + unsigned long *fgraph_data = NULL; + unsigned long ret_ip; + struct fprobe *fp; + int size, curr; + int size_words; + + fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size); + if (WARN_ON_ONCE(!fgraph_data)) return; + size_words = SIZE_IN_LONG(size); + ret_ip = ftrace_regs_get_instruction_pointer(fregs); + + preempt_disable(); + + curr = 0; + while (size_words > curr) { + read_fprobe_header(&fgraph_data[curr], &fp, &size); + if (!fp) + break; + curr += FPROBE_HEADER_SIZE_IN_LONG; + if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) { + if (WARN_ON_ONCE(curr + size > size_words)) + break; + fp->exit_handler(fp, trace->func, ret_ip, fregs, + size ? fgraph_data + curr : NULL); + } + curr += size; + } + preempt_enable(); +} +NOKPROBE_SYMBOL(fprobe_return); + +static struct fgraph_ops fprobe_graph_ops = { + .entryfunc = fprobe_entry, + .retfunc = fprobe_return, +}; +static int fprobe_graph_active; + +/* Add @addrs to the ftrace filter and register fgraph if needed. */ +static int fprobe_graph_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_graph_active) { + ret = register_ftrace_graph(&fprobe_graph_ops); + if (WARN_ON_ONCE(ret)) { + ftrace_free_filter(&fprobe_graph_ops.ops); + return ret; + } } + fprobe_graph_active++; + return 0; +} - fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, - fp->entry_data_size ? (void *)fpr->data : NULL); - ftrace_test_recursion_unlock(bit); +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ +static void fprobe_graph_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + fprobe_graph_active--; + /* Q: should we unregister it ? */ + if (!fprobe_graph_active) + unregister_ftrace_graph(&fprobe_graph_ops); + + if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } -NOKPROBE_SYMBOL(fprobe_exit_handler); static int symbols_cmp(const void *a, const void *b) { @@ -175,53 +439,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num) return ERR_PTR(-ENOENT); } -static void fprobe_init(struct fprobe *fp) -{ - fp->nmissed = 0; - if (fprobe_shared_with_kprobes(fp)) - fp->ops.func = fprobe_kprobe_handler; - else - fp->ops.func = fprobe_handler; - fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; -} +struct filter_match_data { + const char *filter; + const char *notfilter; + size_t index; + size_t size; + unsigned long *addrs; +}; -static int fprobe_init_rethook(struct fprobe *fp, int num) +static int filter_match_callback(void *data, const char *name, unsigned long addr) { - int size; + struct filter_match_data *match = data; - if (!fp->exit_handler) { - fp->rethook = NULL; + if (!glob_match(match->filter, name) || + (match->notfilter && glob_match(match->notfilter, name))) return 0; - } - /* Initialize rethook if needed */ - if (fp->nr_maxactive) - num = fp->nr_maxactive; - else - num *= num_possible_cpus() * 2; - if (num <= 0) - return -EINVAL; + if (!ftrace_location(addr)) + return 0; + + if (match->addrs) + match->addrs[match->index] = addr; - size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size; + match->index++; + return match->index == match->size; +} - /* Initialize rethook */ - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num); - if (IS_ERR(fp->rethook)) - return PTR_ERR(fp->rethook); +/* + * Make IP list from the filter/no-filter glob patterns. + * Return the number of matched symbols, or -ENOENT. + */ +static int ip_list_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, size_t size) +{ + struct filter_match_data match = { .filter = filter, .notfilter = notfilter, + .index = 0, .size = size, .addrs = addrs}; + int ret; - return 0; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); + if (ret < 0) + return ret; + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + + return match.index ?: -ENOENT; } static void fprobe_fail_cleanup(struct fprobe *fp) { - if (!IS_ERR_OR_NULL(fp->rethook)) { - /* Don't need to cleanup rethook->handler because this is not used. */ - rethook_free(fp->rethook); - fp->rethook = NULL; + kfree(fp->hlist_array); + fp->hlist_array = NULL; +} + +/* Initialize the fprobe data structure. */ +static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) +{ + struct fprobe_hlist *hlist_array; + unsigned long addr; + int size, i; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + size = ALIGN(fp->entry_data_size, sizeof(long)); + if (size > MAX_FPROBE_DATA_SIZE) + return -E2BIG; + fp->entry_data_size = size; + + hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL); + if (!hlist_array) + return -ENOMEM; + + fp->nmissed = 0; + + hlist_array->size = num; + fp->hlist_array = hlist_array; + hlist_array->fp = fp; + for (i = 0; i < num; i++) { + hlist_array->array[i].fp = fp; + addr = ftrace_location(addrs[i]); + if (!addr) { + fprobe_fail_cleanup(fp); + return -ENOENT; + } + hlist_array->array[i].addr = addr; } - ftrace_free_filter(&fp->ops); + return 0; } +#define FPROBE_IPS_MAX INT_MAX + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. @@ -235,46 +543,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - struct ftrace_hash *hash; - unsigned char *str; - int ret, len; + unsigned long *addrs; + int ret; if (!fp || !filter) return -EINVAL; - fprobe_init(fp); - - len = strlen(filter); - str = kstrdup(filter, GFP_KERNEL); - ret = ftrace_set_filter(&fp->ops, str, len, 0); - kfree(str); - if (ret) + ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); + if (ret < 0) return ret; - if (notfilter) { - len = strlen(notfilter); - str = kstrdup(notfilter, GFP_KERNEL); - ret = ftrace_set_notrace(&fp->ops, str, len, 0); - kfree(str); - if (ret) - goto out; - } - - /* TODO: - * correctly calculate the total number of filtered symbols - * from both filter and notfilter. - */ - hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); - if (WARN_ON_ONCE(!hash)) - goto out; - - ret = fprobe_init_rethook(fp, (int)hash->count); - if (!ret) - ret = register_ftrace_function(&fp->ops); + addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + ret = ip_list_from_filter(filter, notfilter, addrs, ret); + if (ret > 0) + ret = register_fprobe_ips(fp, addrs, ret); -out: - if (ret) - fprobe_fail_cleanup(fp); + kfree(addrs); return ret; } EXPORT_SYMBOL_GPL(register_fprobe); @@ -282,7 +568,7 @@ EXPORT_SYMBOL_GPL(register_fprobe); /** * register_fprobe_ips() - Register fprobe to ftrace by address. * @fp: A fprobe data structure to be registered. - * @addrs: An array of target ftrace location addresses. + * @addrs: An array of target function address. * @num: The number of entries of @addrs. * * Register @fp to ftrace for enabling the probe on the address given by @addrs. @@ -294,23 +580,27 @@ EXPORT_SYMBOL_GPL(register_fprobe); */ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) { - int ret; - - if (!fp || !addrs || num <= 0) - return -EINVAL; - - fprobe_init(fp); + struct fprobe_hlist *hlist_array; + int ret, i; - ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + ret = fprobe_init(fp, addrs, num); if (ret) return ret; - ret = fprobe_init_rethook(fp, num); - if (!ret) - ret = register_ftrace_function(&fp->ops); + mutex_lock(&fprobe_mutex); + + hlist_array = fp->hlist_array; + ret = fprobe_graph_add_ips(addrs, num); + if (!ret) { + add_fprobe_hash(fp); + for (i = 0; i < hlist_array->size; i++) + insert_fprobe_node(&hlist_array->array[i]); + } + mutex_unlock(&fprobe_mutex); if (ret) fprobe_fail_cleanup(fp); + return ret; } EXPORT_SYMBOL_GPL(register_fprobe_ips); @@ -348,14 +638,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms); bool fprobe_is_registered(struct fprobe *fp) { - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fp || !fp->hlist_array) return false; return true; } /** - * unregister_fprobe() - Unregister fprobe from ftrace + * unregister_fprobe() - Unregister fprobe. * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). @@ -364,23 +653,40 @@ bool fprobe_is_registered(struct fprobe *fp) */ int unregister_fprobe(struct fprobe *fp) { - int ret; + struct fprobe_hlist *hlist_array; + unsigned long *addrs = NULL; + int ret = 0, i, count; - if (!fprobe_is_registered(fp)) - return -EINVAL; + mutex_lock(&fprobe_mutex); + if (!fp || !is_fprobe_still_exist(fp)) { + ret = -EINVAL; + goto out; + } + + hlist_array = fp->hlist_array; + addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL); + if (!addrs) { + ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */ + goto out; + } - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_stop(fp->rethook); + /* Remove non-synonim ips from table and hash */ + count = 0; + for (i = 0; i < hlist_array->size; i++) { + if (!delete_fprobe_node(&hlist_array->array[i])) + addrs[count++] = hlist_array->array[i].addr; + } + del_fprobe_hash(fp); - ret = unregister_ftrace_function(&fp->ops); - if (ret < 0) - return ret; + fprobe_graph_remove_ips(addrs, count); - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_free(fp->rethook); + kfree_rcu(hlist_array, rcu); + fp->hlist_array = NULL; - ftrace_free_filter(&fp->ops); +out: + mutex_unlock(&fprobe_mutex); + kfree(addrs); return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83ba342aef31..fc88e0688daf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -74,7 +74,8 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), \ + .subop_list = LIST_HEAD_INIT(opsname.subop_list), #else #define INIT_OPS_HASH(opsname) #endif @@ -99,7 +100,7 @@ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* What to set function_trace_op to */ static struct ftrace_ops *set_function_trace_op; -static bool ftrace_pids_enabled(struct ftrace_ops *ops) +bool ftrace_pids_enabled(struct ftrace_ops *ops) { struct trace_array *tr; @@ -121,7 +122,7 @@ static int ftrace_disabled __read_mostly; DEFINE_MUTEX(ftrace_lock); -struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; +struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = (struct ftrace_ops __rcu *)&ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; @@ -161,12 +162,14 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { mutex_init(&ops->local_hash.regex_lock); + INIT_LIST_HEAD(&ops->subop_list); ops->func_hash = &ops->local_hash; ops->flags |= FTRACE_OPS_FL_INITIALIZED; } #endif } +/* Call this function for when a callback filters on set_ftrace_pid */ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { @@ -235,8 +238,6 @@ static void update_ftrace_function(void) func = ftrace_ops_list_func; } - update_function_graph_func(); - /* If there's no change, then do nothing more here */ if (ftrace_trace_function == func) return; @@ -310,7 +311,7 @@ static int remove_ftrace_ops(struct ftrace_ops __rcu **list, lockdep_is_held(&ftrace_lock)) == ops && rcu_dereference_protected(ops->next, lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { - *list = &ftrace_list_end; + rcu_assign_pointer(*list, &ftrace_list_end); return 0; } @@ -406,6 +407,8 @@ static void ftrace_update_pid_func(void) } } while_for_each_ftrace_op(op); + fgraph_update_pid_func(); + update_ftrace_function(); } @@ -533,24 +536,22 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); /* we raced with function_profile_reset() */ - if (unlikely(rec->counter == 0)) { - ret = -EBUSY; - goto out; - } + if (unlikely(rec->counter == 0)) + return -EBUSY; #ifdef CONFIG_FUNCTION_GRAPH_TRACER avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) - goto out; + return 0; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); @@ -587,10 +584,8 @@ static int function_stat_show(struct seq_file *m, void *v) trace_print_seq(m, &s); #endif seq_putc(m, '\n'); -out: - mutex_unlock(&ftrace_profile_lock); - return ret; + return 0; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -786,27 +781,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; - unsigned long flags; if (!ftrace_profile_enabled) return; - local_irq_save(flags); + guard(preempt_notrace)(); stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; rec = ftrace_find_profiled_func(stat, ip); if (!rec) { rec = ftrace_profile_alloc(stat, ip); if (!rec) - goto out; + return; } rec->counter++; - out: - local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -817,9 +809,17 @@ void ftrace_graph_graph_time_control(bool enable) fgraph_graph_time = enable; } -static int profile_graph_entry(struct ftrace_graph_ent *trace) +struct profile_fgraph_data { + unsigned long long calltime; + unsigned long long subtime; + unsigned long long sleeptime; +}; + +static int profile_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct ftrace_ret_stack *ret_stack; + struct profile_fgraph_data *profile_data; function_profile_call(trace->func, 0, NULL, NULL); @@ -827,42 +827,57 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) if (!current->ret_stack) return 0; - ret_stack = ftrace_graph_get_ret_stack(current, 0); - if (ret_stack) - ret_stack->subtime = 0; + profile_data = fgraph_reserve_data(gops->idx, sizeof(*profile_data)); + if (!profile_data) + return 0; + + profile_data->subtime = 0; + profile_data->sleeptime = current->ftrace_sleeptime; + profile_data->calltime = trace_clock_local(); return 1; } -static void profile_graph_return(struct ftrace_graph_ret *trace) +static void profile_graph_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct ftrace_ret_stack *ret_stack; + struct profile_fgraph_data *profile_data; struct ftrace_profile_stat *stat; unsigned long long calltime; + unsigned long long rettime = trace_clock_local(); struct ftrace_profile *rec; - unsigned long flags; + int size; + + guard(preempt_notrace)(); - local_irq_save(flags); stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; + + profile_data = fgraph_retrieve_data(gops->idx, &size); /* If the calltime was zero'd ignore it */ - if (!trace->calltime) - goto out; + if (!profile_data || !profile_data->calltime) + return; + + calltime = rettime - profile_data->calltime; - calltime = trace->rettime - trace->calltime; + if (!fgraph_sleep_time) { + if (current->ftrace_sleeptime) + calltime -= current->ftrace_sleeptime - profile_data->sleeptime; + } if (!fgraph_graph_time) { + struct profile_fgraph_data *parent_data; /* Append this call time to the parent time to subtract */ - ret_stack = ftrace_graph_get_ret_stack(current, 1); - if (ret_stack) - ret_stack->subtime += calltime; + parent_data = fgraph_retrieve_parent_data(gops->idx, &size, 1); + if (parent_data) + parent_data->subtime += calltime; - ret_stack = ftrace_graph_get_ret_stack(current, 0); - if (ret_stack && ret_stack->subtime < calltime) - calltime -= ret_stack->subtime; + if (profile_data->subtime && profile_data->subtime < calltime) + calltime -= profile_data->subtime; else calltime = 0; } @@ -872,9 +887,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) rec->time += calltime; rec->time_squared += calltime * calltime; } - - out: - local_irq_restore(flags); } static struct fgraph_ops fprofiler_ops = { @@ -884,6 +896,7 @@ static struct fgraph_ops fprofiler_ops = { static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&fprofiler_ops.ops); return register_ftrace_graph(&fprofiler_ops); } @@ -894,12 +907,11 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&ftrace_profile_ops); return register_ftrace_function(&ftrace_profile_ops); } @@ -922,20 +934,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, val = !!val; - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { ret = ftrace_profile_init(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ret = register_ftrace_profiler(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; @@ -946,8 +954,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_profiler(); } } - out: - mutex_unlock(&ftrace_profile_lock); *ppos += cnt; @@ -1160,7 +1166,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) * Search a given @hash to see if a given instruction pointer (@ip) * exists in it. * - * Returns the entry that holds the @ip if found. NULL otherwise. + * Returns: the entry that holds the @ip if found. NULL otherwise. */ struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) @@ -1282,7 +1288,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) /** * ftrace_free_filter - remove all filters for an ftrace_ops - * @ops - the ops to remove the filters from + * @ops: the ops to remove the filters from */ void ftrace_free_filter(struct ftrace_ops *ops) { @@ -1314,7 +1320,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return hash; } - +/* Used to save filters on functions for modules not loaded yet */ static int ftrace_add_mod(struct trace_array *tr, const char *func, const char *module, int enable) @@ -1380,15 +1386,17 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) return NULL; } -static void -ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); -static void -ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops); +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) +/* + * Allocate a new hash and remove entries from @src and move them to the new hash. + * On success, the @src hash will be empty and should be freed. + */ +static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; @@ -1424,6 +1432,7 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) return new_hash; } +/* Move the @src entries to a newly allocated hash */ static struct ftrace_hash * __ftrace_hash_move(struct ftrace_hash *src) { @@ -1435,9 +1444,29 @@ __ftrace_hash_move(struct ftrace_hash *src) if (ftrace_hash_empty(src)) return EMPTY_HASH; - return dup_hash(src, size); + return __move_hash(src, size); } +/** + * ftrace_hash_move - move a new hash to a filter and do updates + * @ops: The ops with the hash that @dst points to + * @enable: True if for the filter hash, false for the notrace hash + * @dst: Points to the @ops hash that should be updated + * @src: The hash to update @dst with + * + * This is called when an ftrace_ops hash is being updated and the + * the kernel needs to reflect this. Note, this only updates the kernel + * function callbacks if the @ops is enabled (not to be confused with + * @enable above). If the @ops is enabled, its hash determines what + * callbacks get called. This function gets called when the @ops hash + * is updated and it requires new callbacks. + * + * On success the elements of @src is moved to @dst, and @dst is updated + * properly, as well as the functions determined by the @ops hashes + * are now calling the @ops callback function. + * + * Regardless of return type, @src should be freed with free_ftrace_hash(). + */ static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) @@ -1467,11 +1496,11 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, * Remove the current set, update the hash and add * them back. */ - ftrace_hash_rec_disable_modify(ops, enable); + ftrace_hash_rec_disable_modify(ops); rcu_assign_pointer(*dst, new_hash); - ftrace_hash_rec_enable_modify(ops, enable); + ftrace_hash_rec_enable_modify(ops); return 0; } @@ -1587,7 +1616,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) * @end: end of range to search (inclusive). @end points to the last byte * to check. * - * Returns rec->ip if the related ftrace location is a least partly within + * Returns: rec->ip if the related ftrace location is a least partly within * the given address range. That is, the first address of the instruction * that is either a NOP or call to the function tracer. It checks the ftrace * internal tables to determine if the address belongs or not. @@ -1595,43 +1624,42 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct dyn_ftrace *rec; + unsigned long ip = 0; + rcu_read_lock(); rec = lookup_rec(start, end); if (rec) - return rec->ip; + ip = rec->ip; + rcu_read_unlock(); - return 0; + return ip; } /** * ftrace_location - return the ftrace location * @ip: the instruction pointer to check * - * If @ip matches the ftrace location, return @ip. - * If @ip matches sym+0, return sym's ftrace location. - * Otherwise, return 0. + * Returns: + * * If @ip matches the ftrace location, return @ip. + * * If @ip matches sym+0, return sym's ftrace location. + * * Otherwise, return 0. */ unsigned long ftrace_location(unsigned long ip) { - struct dyn_ftrace *rec; + unsigned long loc; unsigned long offset; unsigned long size; - rec = lookup_rec(ip, ip); - if (!rec) { + loc = ftrace_location_range(ip, ip); + if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) - goto out; + return 0; /* map sym+0 to __fentry__ */ if (!offset) - rec = lookup_rec(ip, ip + size - 1); + loc = ftrace_location_range(ip, ip + size - 1); } - - if (rec) - return rec->ip; - -out: - return 0; + return loc; } /** @@ -1639,7 +1667,7 @@ out: * @start: start of range to search * @end: end of range to search (inclusive). @end points to the last byte to check. * - * Returns 1 if @start and @end contains a ftrace location. + * Returns: 1 if @start and @end contains a ftrace location. * That is, the instruction that is either a NOP or call to * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. @@ -1693,12 +1721,21 @@ static bool skip_record(struct dyn_ftrace *rec) !(rec->flags & FTRACE_FL_ENABLED); } +/* + * This is the main engine to the ftrace updates to the dyn_ftrace records. + * + * It will iterate through all the available ftrace functions + * (the ones that ftrace can have callbacks to) and set the flags + * in the associated dyn_ftrace records. + * + * @inc: If true, the functions associated to @ops are added to + * the dyn_ftrace records, otherwise they are removed. + */ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, - int filter_hash, bool inc) { struct ftrace_hash *hash; - struct ftrace_hash *other_hash; + struct ftrace_hash *notrace_hash; struct ftrace_page *pg; struct dyn_ftrace *rec; bool update = false; @@ -1710,35 +1747,16 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return false; /* - * In the filter_hash case: * If the count is zero, we update all records. * Otherwise we just update the items in the hash. - * - * In the notrace_hash case: - * We enable the update in the hash. - * As disabling notrace means enabling the tracing, - * and enabling notrace means disabling, the inc variable - * gets inversed. */ - if (filter_hash) { - hash = ops->func_hash->filter_hash; - other_hash = ops->func_hash->notrace_hash; - if (ftrace_hash_empty(hash)) - all = true; - } else { - inc = !inc; - hash = ops->func_hash->notrace_hash; - other_hash = ops->func_hash->filter_hash; - /* - * If the notrace hash has no items, - * then there's nothing to do. - */ - if (ftrace_hash_empty(hash)) - return false; - } + hash = ops->func_hash->filter_hash; + notrace_hash = ops->func_hash->notrace_hash; + if (ftrace_hash_empty(hash)) + all = true; do_for_each_ftrace_rec(pg, rec) { - int in_other_hash = 0; + int in_notrace_hash = 0; int in_hash = 0; int match = 0; @@ -1750,26 +1768,17 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * Only the filter_hash affects all records. * Update if the record is not in the notrace hash. */ - if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) + if (!notrace_hash || !ftrace_lookup_ip(notrace_hash, rec->ip)) match = 1; } else { in_hash = !!ftrace_lookup_ip(hash, rec->ip); - in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + in_notrace_hash = !!ftrace_lookup_ip(notrace_hash, rec->ip); /* - * If filter_hash is set, we want to match all functions - * that are in the hash but not in the other hash. - * - * If filter_hash is not set, then we are decrementing. - * That means we match anything that is in the hash - * and also in the other_hash. That is, we need to turn - * off functions in the other hash because they are disabled - * by this hash. + * We want to match all functions that are in the hash but + * not in the other hash. */ - if (filter_hash && in_hash && !in_other_hash) - match = 1; - else if (!filter_hash && in_hash && - (in_other_hash || ftrace_hash_empty(other_hash))) + if (in_hash && !in_notrace_hash) match = 1; } if (!match) @@ -1875,24 +1884,48 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return update; } -static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, - int filter_hash) +/* + * This is called when an ops is removed from tracing. It will decrement + * the counters of the dyn_ftrace records for all the functions that + * the @ops attached to. + */ +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops) { - return __ftrace_hash_rec_update(ops, filter_hash, 0); + return __ftrace_hash_rec_update(ops, false); } -static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, - int filter_hash) +/* + * This is called when an ops is added to tracing. It will increment + * the counters of the dyn_ftrace records for all the functions that + * the @ops attached to. + */ +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops) { - return __ftrace_hash_rec_update(ops, filter_hash, 1); + return __ftrace_hash_rec_update(ops, true); } -static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, - int filter_hash, int inc) +/* + * This function will update what functions @ops traces when its filter + * changes. + * + * The @inc states if the @ops callbacks are going to be added or removed. + * When one of the @ops hashes are updated to a "new_hash" the dyn_ftrace + * records are update via: + * + * ftrace_hash_rec_disable_modify(ops); + * ops->hash = new_hash + * ftrace_hash_rec_enable_modify(ops); + * + * Where the @ops is removed from all the records it is tracing using + * its old hash. The @ops hash is updated to the new hash, and then + * the @ops is added back to the records so that it is tracing all + * the new functions. + */ +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, bool inc) { struct ftrace_ops *op; - __ftrace_hash_rec_update(ops, filter_hash, inc); + __ftrace_hash_rec_update(ops, inc); if (ops->func_hash != &global_ops.local_hash) return; @@ -1906,20 +1939,18 @@ static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, if (op == ops) continue; if (op->func_hash == &global_ops.local_hash) - __ftrace_hash_rec_update(op, filter_hash, inc); + __ftrace_hash_rec_update(op, inc); } while_for_each_ftrace_op(op); } -static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, - int filter_hash) +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops) { - ftrace_hash_rec_update_modify(ops, filter_hash, 0); + ftrace_hash_rec_update_modify(ops, false); } -static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, - int filter_hash) +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) { - ftrace_hash_rec_update_modify(ops, filter_hash, 1); + ftrace_hash_rec_update_modify(ops, true); } /* @@ -2022,7 +2053,7 @@ rollback: continue; if (rec == end) - goto err_out; + return -EBUSY; in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -2035,7 +2066,6 @@ rollback: rec->flags |= FTRACE_FL_IPMODIFY; } while_for_each_ftrace_rec(); -err_out: return -EBUSY; } @@ -2537,7 +2567,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); -int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -2574,7 +2603,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS * is not set, then it wants to convert to the normal callback. * - * Returns the address of the trampoline to set to + * Returns: the address of the trampoline to set to */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { @@ -2615,7 +2644,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) * a function that saves all the regs. Basically the '_EN' version * represents the current state of the function. * - * Returns the address of the trampoline that is currently being called + * Returns: the address of the trampoline that is currently being called */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { @@ -2719,7 +2748,7 @@ struct ftrace_rec_iter { /** * ftrace_rec_iter_start - start up iterating over traced functions * - * Returns an iterator handle that is used to iterate over all + * Returns: an iterator handle that is used to iterate over all * the records that represent address locations where functions * are traced. * @@ -2751,7 +2780,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void) * ftrace_rec_iter_next - get the next record to process. * @iter: The handle to the iterator. * - * Returns the next iterator after the given iterator @iter. + * Returns: the next iterator after the given iterator @iter. */ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) { @@ -2776,7 +2805,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) * ftrace_rec_iter_record - get the record at the iterator location * @iter: The current iterator location * - * Returns the record that the current @iter is at. + * Returns: the record that the current @iter is at. */ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) { @@ -3043,7 +3072,7 @@ int ftrace_startup(struct ftrace_ops *ops, int command) return ret; } - if (ftrace_hash_rec_enable(ops, 1)) + if (ftrace_hash_rec_enable(ops)) command |= FTRACE_UPDATE_CALLS; ftrace_startup_enable(command); @@ -3085,7 +3114,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) /* Disabling ipmodify never fails */ ftrace_hash_ipmodify_disable(ops); - if (ftrace_hash_rec_disable(ops, 1)) + if (ftrace_hash_rec_disable(ops)) command |= FTRACE_UPDATE_CALLS; ops->flags &= ~FTRACE_OPS_FL_ENABLED; @@ -3156,8 +3185,7 @@ out: * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - synchronize_rcu_tasks(); + synchronize_rcu_tasks(); ftrace_trampoline_free(ops); } @@ -3165,7 +3193,487 @@ out: return 0; } -static u64 ftrace_update_time; +/* Simply make a copy of @src and return it */ +static struct ftrace_hash *copy_hash(struct ftrace_hash *src) +{ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return alloc_and_copy_ftrace_hash(src->size_bits, src); +} + +/* + * Append @new_hash entries to @hash: + * + * If @hash is the EMPTY_HASH then it traces all functions and nothing + * needs to be done. + * + * If @new_hash is the EMPTY_HASH, then make *hash the EMPTY_HASH so + * that it traces everything. + * + * Otherwise, go through all of @new_hash and add anything that @hash + * doesn't already have, to @hash. + * + * The filter_hash updates uses just the append_hash() function + * and the notrace_hash does not. + */ +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) +{ + struct ftrace_func_entry *entry; + int size; + int i; + + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } + + /* If new_hash has everything make hash have everything */ + if (ftrace_hash_empty(new_hash)) { + free_ftrace_hash(*hash); + *hash = EMPTY_HASH; + return 0; + } + + size = 1 << new_hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &new_hash->buckets[i], hlist) { + /* Only add if not already in hash */ + if (!__ftrace_lookup_ip(*hash, entry->ip) && + add_hash_entry(*hash, entry->ip) == NULL) + return -ENOMEM; + } + } + return 0; +} + +/* + * Add to @hash only those that are in both @new_hash1 and @new_hash2 + * + * The notrace_hash updates uses just the intersect_hash() function + * and the filter_hash does not. + */ +static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash1, + struct ftrace_hash *new_hash2) +{ + struct ftrace_func_entry *entry; + int size; + int i; + + /* + * If new_hash1 or new_hash2 is the EMPTY_HASH then make the hash + * empty as well as empty for notrace means none are notraced. + */ + if (ftrace_hash_empty(new_hash1) || ftrace_hash_empty(new_hash2)) { + free_ftrace_hash(*hash); + *hash = EMPTY_HASH; + return 0; + } + + size = 1 << new_hash1->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &new_hash1->buckets[i], hlist) { + /* Only add if in both @new_hash1 and @new_hash2 */ + if (__ftrace_lookup_ip(new_hash2, entry->ip) && + add_hash_entry(*hash, entry->ip) == NULL) + return -ENOMEM; + } + } + /* If nothing intersects, make it the empty set */ + if (ftrace_hash_empty(*hash)) { + free_ftrace_hash(*hash); + *hash = EMPTY_HASH; + } + return 0; +} + +/* Return a new hash that has a union of all @ops->filter_hash entries */ +static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) +{ + struct ftrace_hash *new_hash = NULL; + struct ftrace_ops *subops; + int size_bits; + int ret; + + if (ops->func_hash->filter_hash) + size_bits = ops->func_hash->filter_hash->size_bits; + else + size_bits = FTRACE_HASH_DEFAULT_BITS; + + list_for_each_entry(subops, &ops->subop_list, list) { + ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); + if (ret < 0) { + free_ftrace_hash(new_hash); + return NULL; + } + /* Nothing more to do if new_hash is empty */ + if (ftrace_hash_empty(new_hash)) + break; + } + /* Can't return NULL as that means this failed */ + return new_hash ? : EMPTY_HASH; +} + +/* Make @ops trace evenything except what all its subops do not trace */ +static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops) +{ + struct ftrace_hash *new_hash = NULL; + struct ftrace_ops *subops; + int size_bits; + int ret; + + list_for_each_entry(subops, &ops->subop_list, list) { + struct ftrace_hash *next_hash; + + if (!new_hash) { + size_bits = subops->func_hash->notrace_hash->size_bits; + new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash); + if (!new_hash) + return NULL; + continue; + } + size_bits = new_hash->size_bits; + next_hash = new_hash; + new_hash = alloc_ftrace_hash(size_bits); + ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash); + free_ftrace_hash(next_hash); + if (ret < 0) { + free_ftrace_hash(new_hash); + return NULL; + } + /* Nothing more to do if new_hash is empty */ + if (ftrace_hash_empty(new_hash)) + break; + } + return new_hash; +} + +static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B) +{ + struct ftrace_func_entry *entry; + int size; + int i; + + if (ftrace_hash_empty(A)) + return ftrace_hash_empty(B); + + if (ftrace_hash_empty(B)) + return ftrace_hash_empty(A); + + if (A->count != B->count) + return false; + + size = 1 << A->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &A->buckets[i], hlist) { + if (!__ftrace_lookup_ip(B, entry->ip)) + return false; + } + } + + return true; +} + +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash); + +static int __ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, + struct ftrace_hash **orig_hash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + int ret; + + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + return ret; +} + +static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_hash, + struct ftrace_hash *notrace_hash) +{ + int ret; + + if (!ops_equal(filter_hash, ops->func_hash->filter_hash)) { + ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->filter_hash, + filter_hash, 1); + if (ret < 0) + return ret; + } + + if (!ops_equal(notrace_hash, ops->func_hash->notrace_hash)) { + ret = __ftrace_hash_move_and_update_ops(ops, &ops->func_hash->notrace_hash, + notrace_hash, 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * ftrace_startup_subops - enable tracing for subops of an ops + * @ops: Manager ops (used to pick all the functions of its subops) + * @subops: A new ops to add to @ops + * @command: Extra commands to use to enable tracing + * + * The @ops is a manager @ops that has the filter that includes all the functions + * that its list of subops are tracing. Adding a new @subops will add the + * functions of @subops to @ops. + */ +int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) +{ + struct ftrace_hash *filter_hash; + struct ftrace_hash *notrace_hash; + struct ftrace_hash *save_filter_hash; + struct ftrace_hash *save_notrace_hash; + int size_bits; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + ftrace_ops_init(ops); + ftrace_ops_init(subops); + + if (WARN_ON_ONCE(subops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + + /* Make everything canonical (Just in case!) */ + if (!ops->func_hash->filter_hash) + ops->func_hash->filter_hash = EMPTY_HASH; + if (!ops->func_hash->notrace_hash) + ops->func_hash->notrace_hash = EMPTY_HASH; + if (!subops->func_hash->filter_hash) + subops->func_hash->filter_hash = EMPTY_HASH; + if (!subops->func_hash->notrace_hash) + subops->func_hash->notrace_hash = EMPTY_HASH; + + /* For the first subops to ops just enable it normally */ + if (list_empty(&ops->subop_list)) { + /* Just use the subops hashes */ + filter_hash = copy_hash(subops->func_hash->filter_hash); + notrace_hash = copy_hash(subops->func_hash->notrace_hash); + if (!filter_hash || !notrace_hash) { + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + return -ENOMEM; + } + + save_filter_hash = ops->func_hash->filter_hash; + save_notrace_hash = ops->func_hash->notrace_hash; + + ops->func_hash->filter_hash = filter_hash; + ops->func_hash->notrace_hash = notrace_hash; + list_add(&subops->list, &ops->subop_list); + ret = ftrace_startup(ops, command); + if (ret < 0) { + list_del(&subops->list); + ops->func_hash->filter_hash = save_filter_hash; + ops->func_hash->notrace_hash = save_notrace_hash; + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + } else { + free_ftrace_hash(save_filter_hash); + free_ftrace_hash(save_notrace_hash); + subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP; + subops->managed = ops; + } + return ret; + } + + /* + * Here there's already something attached. Here are the rules: + * o If either filter_hash is empty then the final stays empty + * o Otherwise, the final is a superset of both hashes + * o If either notrace_hash is empty then the final stays empty + * o Otherwise, the final is an intersection between the hashes + */ + if (ftrace_hash_empty(ops->func_hash->filter_hash) || + ftrace_hash_empty(subops->func_hash->filter_hash)) { + filter_hash = EMPTY_HASH; + } else { + size_bits = max(ops->func_hash->filter_hash->size_bits, + subops->func_hash->filter_hash->size_bits); + filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); + if (!filter_hash) + return -ENOMEM; + ret = append_hash(&filter_hash, subops->func_hash->filter_hash, + size_bits); + if (ret < 0) { + free_ftrace_hash(filter_hash); + return ret; + } + } + + if (ftrace_hash_empty(ops->func_hash->notrace_hash) || + ftrace_hash_empty(subops->func_hash->notrace_hash)) { + notrace_hash = EMPTY_HASH; + } else { + size_bits = max(ops->func_hash->filter_hash->size_bits, + subops->func_hash->filter_hash->size_bits); + notrace_hash = alloc_ftrace_hash(size_bits); + if (!notrace_hash) { + free_ftrace_hash(filter_hash); + return -ENOMEM; + } + + ret = intersect_hash(¬race_hash, ops->func_hash->filter_hash, + subops->func_hash->filter_hash); + if (ret < 0) { + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + return ret; + } + } + + list_add(&subops->list, &ops->subop_list); + + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + if (ret < 0) { + list_del(&subops->list); + } else { + subops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP; + subops->managed = ops; + } + return ret; +} + +/** + * ftrace_shutdown_subops - Remove a subops from a manager ops + * @ops: A manager ops to remove @subops from + * @subops: The subops to remove from @ops + * @command: Any extra command flags to add to modifying the text + * + * Removes the functions being traced by the @subops from @ops. Note, it + * will not affect functions that are being traced by other subops that + * still exist in @ops. + * + * If the last subops is removed from @ops, then @ops is shutdown normally. + */ +int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) +{ + struct ftrace_hash *filter_hash; + struct ftrace_hash *notrace_hash; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + if (WARN_ON_ONCE(!(subops->flags & FTRACE_OPS_FL_ENABLED))) + return -EINVAL; + + list_del(&subops->list); + + if (list_empty(&ops->subop_list)) { + /* Last one, just disable the current ops */ + + ret = ftrace_shutdown(ops, command); + if (ret < 0) { + list_add(&subops->list, &ops->subop_list); + return ret; + } + + subops->flags &= ~FTRACE_OPS_FL_ENABLED; + + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; + subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP); + subops->managed = NULL; + + return 0; + } + + /* Rebuild the hashes without subops */ + filter_hash = append_hashes(ops); + notrace_hash = intersect_hashes(ops); + if (!filter_hash || !notrace_hash) { + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + list_add(&subops->list, &ops->subop_list); + return -ENOMEM; + } + + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + if (ret < 0) { + list_add(&subops->list, &ops->subop_list); + } else { + subops->flags &= ~(FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_SUBOP); + subops->managed = NULL; + } + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + return ret; +} + +static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, + struct ftrace_hash **orig_subhash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops *ops = subops->managed; + struct ftrace_hash **orig_hash; + struct ftrace_hash *save_hash; + struct ftrace_hash *new_hash; + int ret; + + /* Manager ops can not be subops (yet) */ + if (WARN_ON_ONCE(!ops || ops->flags & FTRACE_OPS_FL_SUBOP)) + return -EINVAL; + + /* Move the new hash over to the subops hash */ + save_hash = *orig_subhash; + *orig_subhash = __ftrace_hash_move(hash); + if (!*orig_subhash) { + *orig_subhash = save_hash; + return -ENOMEM; + } + + /* Create a new_hash to hold the ops new functions */ + if (enable) { + orig_hash = &ops->func_hash->filter_hash; + new_hash = append_hashes(ops); + } else { + orig_hash = &ops->func_hash->notrace_hash; + new_hash = intersect_hashes(ops); + } + + /* Move the hash over to the new hash */ + ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); + + free_ftrace_hash(new_hash); + + if (ret) { + /* Put back the original hash */ + free_ftrace_hash_rcu(*orig_subhash); + *orig_subhash = save_hash; + } else { + free_ftrace_hash_rcu(save_hash); + } + return ret; +} + + +u64 ftrace_update_time; +u64 ftrace_total_mod_time; unsigned long ftrace_update_tot_cnt; unsigned long ftrace_number_of_pages; unsigned long ftrace_number_of_groups; @@ -3185,7 +3693,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) bool init_nop = ftrace_need_init_nop(); struct ftrace_page *pg; struct dyn_ftrace *p; - u64 start, stop; + u64 start, stop, update_time; unsigned long update_cnt = 0; unsigned long rec_flags = 0; int i; @@ -3229,7 +3737,11 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) } stop = ftrace_now(raw_smp_processor_id()); - ftrace_update_time = stop - start; + update_time = stop - start; + if (mod) + ftrace_total_mod_time += update_time; + else + ftrace_update_time = update_time; ftrace_update_tot_cnt += update_cnt; return 0; @@ -4010,6 +4522,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file) * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). + * + * Returns: 0 on success or a negative errno value on failure */ int ftrace_regex_open(struct ftrace_ops *ops, int flag, @@ -4199,12 +4713,12 @@ static int add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, int clear_filter) { - long index = simple_strtoul(func_g->search, NULL, 0); + long index; struct ftrace_page *pg; struct dyn_ftrace *rec; /* The index starts at 1 */ - if (--index < 0) + if (kstrtoul(func_g->search, 0, &index) || --index < 0) return 0; do_for_each_ftrace_rec(pg, rec) { @@ -4306,15 +4820,13 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) mod_g.len = strlen(mod_g.search); } - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out_unlock; + return 0; - if (func_g.type == MATCH_INDEX) { - found = add_rec_by_index(hash, &func_g, clear_filter); - goto out_unlock; - } + if (func_g.type == MATCH_INDEX) + return add_rec_by_index(hash, &func_g, clear_filter); do_for_each_ftrace_rec(pg, rec) { @@ -4323,16 +4835,12 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); - if (ret < 0) { - found = ret; - goto out_unlock; - } + if (ret < 0) + return ret; found = 1; } cond_resched(); } while_for_each_ftrace_rec(); - out_unlock: - mutex_unlock(&ftrace_lock); return found; } @@ -4379,36 +4887,33 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, struct ftrace_hash *hash, int enable) { - struct ftrace_ops_hash old_hash_ops; - struct ftrace_hash *old_hash; - int ret; - - old_hash = *orig_hash; - old_hash_ops.filter_hash = ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; - ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) { - ftrace_ops_update_code(ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } - return ret; -} + if (ops->flags & FTRACE_OPS_FL_SUBOP) + return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable); -static bool module_exists(const char *module) -{ - /* All modules have the symbol __this_module */ - static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; - unsigned long val; - int n; + /* + * If this ops is not enabled, it could be sharing its filters + * with a subop. If that's the case, update the subop instead of + * this ops. Shared filters are only allowed to have one ops set + * at a time, and if we update the ops that is not enabled, + * it will not affect subops that share it. + */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) { + struct ftrace_ops *op; - n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + /* Check if any other manager subops maps to this hash */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + struct ftrace_ops *subops; - if (n > sizeof(modname) - 1) - return false; + list_for_each_entry(subops, &op->subop_list, list) { + if ((subops->flags & FTRACE_OPS_FL_ENABLED) && + subops->func_hash == ops->func_hash) { + return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable); + } + } + } while_for_each_ftrace_op(op); + } - val = module_kallsyms_lookup_name(modname); - return val != 0; + return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); } static int cache_mod(struct trace_array *tr, @@ -4416,14 +4921,14 @@ static int cache_mod(struct trace_array *tr, { struct ftrace_mod_load *ftrace_mod, *n; struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; - int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); /* We do not cache inverse filters */ if (func[0] == '!') { + int ret = -EINVAL; + func++; - ret = -EINVAL; /* Look to remove this hash */ list_for_each_entry_safe(ftrace_mod, n, head, list) { @@ -4439,26 +4944,17 @@ static int cache_mod(struct trace_array *tr, continue; } } - goto out; + return ret; } - ret = -EINVAL; /* We only care about modules that have not been loaded yet */ if (module_exists(module)) - goto out; + return -EINVAL; /* Save this string off, and execute it when the module is loaded */ - ret = ftrace_add_mod(tr, func, module, enable); - out: - mutex_unlock(&ftrace_lock); - - return ret; + return ftrace_add_mod(tr, func, module, enable); } -static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable); - #ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) @@ -4562,6 +5058,9 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func; int ret; + if (!tr) + return -ENODEV; + /* match_records() modifies func, and we need the original */ func = kstrdup(func_orig, GFP_KERNEL); if (!func) @@ -4626,7 +5125,7 @@ struct ftrace_func_mapper { /** * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper * - * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data. */ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) { @@ -4646,7 +5145,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) * @mapper: The mapper that has the ip maps * @ip: the instruction pointer to find the data for * - * Returns the data mapped to @ip if found otherwise NULL. The return + * Returns: the data mapped to @ip if found otherwise NULL. The return * is actually the address of the mapper data pointer. The address is * returned for use cases where the data is no bigger than a long, and * the user can use the data pointer as its data instead of having to @@ -4672,7 +5171,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on success otherwise an error. + * Returns: 0 on success otherwise an error. */ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4701,7 +5200,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @mapper: The mapper that has the ip maps * @ip: The instruction pointer address to remove the data from * - * Returns the data if it is found, otherwise NULL. + * Returns: the data if it is found, otherwise NULL. * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. @@ -4762,7 +5261,7 @@ static void release_probe(struct ftrace_func_probe *probe) { struct ftrace_probe_ops *probe_ops; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); WARN_ON(probe->ref <= 0); @@ -4780,7 +5279,6 @@ static void release_probe(struct ftrace_func_probe *probe) list_del(&probe->list); kfree(probe); } - mutex_unlock(&ftrace_lock); } static void acquire_probe_locked(struct ftrace_func_probe *probe) @@ -5086,20 +5584,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex); __init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; - int ret = 0; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &ftrace_commands); - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return 0; } /* @@ -5109,20 +5602,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd) __init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; - int ret = -ENODEV; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -ENODEV; } static int ftrace_process_regex(struct ftrace_iterator *iter, @@ -5132,7 +5622,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); @@ -5149,17 +5639,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->func(tr, hash, func, command, next, enable); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->func(tr, hash, func, command, next, enable); } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t @@ -5193,12 +5680,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) - goto out; + return ret; } - ret = read; - out: - return ret; + return read; } ssize_t @@ -5230,6 +5715,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); @@ -5259,7 +5747,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long *ips, unsigned int cnt, - int remove, int reset, int enable) + int remove, int reset, int enable, char *mod) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5285,7 +5773,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, goto out_regex_unlock; } - if (buf && !ftrace_match_records(hash, buf, len)) { + if (buf && !match_records(hash, buf, len, mod)) { + /* If this was for a module and nothing was enabled, flag it */ + if (mod) + (*orig_hash)->flags |= FTRACE_HASH_FL_MOD; + + /* + * Even if it is a mod, return error to let caller know + * nothing was added + */ ret = -EINVAL; goto out_regex_unlock; } @@ -5310,19 +5806,11 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -struct ftrace_direct_func { - struct list_head next; - unsigned long addr; - int count; -}; - -static LIST_HEAD(ftrace_direct_funcs); - static int register_ftrace_function_nolock(struct ftrace_ops *ops); /* @@ -5363,6 +5851,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long } } +static void register_ftrace_direct_cb(struct rcu_head *rhp) +{ + struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu); + + free_ftrace_hash(fhp); +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -5461,10 +5956,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) out_unlock: mutex_unlock(&direct_mutex); - if (free_hash && free_hash != EMPTY_HASH) { - synchronize_rcu_tasks(); - free_ftrace_hash(free_hash); - } + if (free_hash && free_hash != EMPTY_HASH) + call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb); if (new_hash) free_ftrace_hash(new_hash); @@ -5477,6 +5970,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); * unregister_ftrace_direct - Remove calls to custom trampoline * previously registered by register_ftrace_direct for @ops object. * @ops: The address of the struct ftrace_ops object + * @addr: The address of the direct function that is called by the @ops functions + * @free_filters: Set to true to remove all filters for the ftrace_ops, false otherwise * * This is used to remove a direct calls to @addr from the nop locations * of the functions registered in @ops (with by ftrace_set_filter_ip @@ -5625,10 +6120,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address - * @ops - the ops to set the filter with - * @ip - the address to add to or remove from the filter. - * @remove - non zero to remove the ip from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ip: the address to add to or remove from the filter. + * @remove: non zero to remove the ip from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ip is NULL, it fails to update filter. @@ -5647,11 +6142,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses - * @ops - the ops to set the filter with - * @ips - the array of addresses to add to or remove from the filter. - * @cnt - the number of addresses in @ips - * @remove - non zero to remove ips from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ips: the array of addresses to add to or remove from the filter. + * @cnt: the number of addresses in @ips + * @remove: non zero to remove ips from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ips array or any ip specified within is NULL , it fails to update filter. @@ -5670,7 +6165,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); /** * ftrace_ops_set_global_filter - setup ops to use global filters - * @ops - the ops which will use the global filters + * @ops: the ops which will use the global filters * * ftrace users who need global function trace filtering should call this. * It can set the global filter only if ops were not initialized before. @@ -5689,15 +6184,46 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); + char *mod = NULL, *func, *command, *next = buf; + char *tmp __free(kfree) = NULL; + struct trace_array *tr = ops->private; + int ret; + + func = strsep(&next, ":"); + + /* This can also handle :mod: parsing */ + if (next) { + if (!tr) + return -EINVAL; + + command = strsep(&next, ":"); + if (strcmp(command, "mod") != 0) + return -EINVAL; + + mod = next; + len = command - func; + /* Save the original func as ftrace_set_hash() can modify it */ + tmp = kstrdup(func, GFP_KERNEL); + } + + ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod); + + if (tr && mod && ret < 0) { + /* Did tmp fail to allocate? */ + if (!tmp) + return -ENOMEM; + ret = cache_mod(tr, tmp, mod, enable); + } + + return ret; } /** * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5716,10 +6242,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the notrace filter with + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled @@ -5738,9 +6264,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** * ftrace_set_global_filter - set a function to filter on with global tracers - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5753,9 +6279,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** * ftrace_set_global_notrace - set a function to not trace with global tracers - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled @@ -5814,9 +6340,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function); static int __init set_graph_max_depth_function(char *str) { - if (!str) + if (!str || kstrtouint(str, 0, &fgraph_max_depth)) return 0; - fgraph_max_depth = simple_strtoul(str, NULL, 0); return 1; } __setup("ftrace_graph_max_depth=", set_graph_max_depth_function); @@ -5854,6 +6379,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) ftrace_ops_init(ops); + /* The trace_array is needed for caching module function filters */ + if (!ops->private) { + struct trace_array *tr = trace_get_global_array(); + + ops->private = tr; + ftrace_init_trace_array(tr); + } + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -6293,12 +6826,10 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) func_g.len = strlen(func_g.search); - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); - if (unlikely(ftrace_disabled)) { - mutex_unlock(&ftrace_lock); + if (unlikely(ftrace_disabled)) return -ENODEV; - } do_for_each_ftrace_rec(pg, rec) { @@ -6314,7 +6845,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) if (entry) continue; if (add_hash_entry(hash, rec->ip) == NULL) - goto out; + return 0; } else { if (entry) { free_hash_entry(hash, entry); @@ -6323,13 +6854,8 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } while_for_each_ftrace_rec(); -out: - mutex_unlock(&ftrace_lock); - - if (fail) - return -EINVAL; - return 0; + return fail ? -EINVAL : 0; } static ssize_t @@ -6593,6 +7119,8 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { WARN_ON(!skipped); + /* Need to synchronize with ftrace_location_range() */ + synchronize_rcu(); ftrace_free_pages(pg_unuse); } return ret; @@ -6806,6 +7334,9 @@ void ftrace_release_mod(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) + synchronize_rcu(); for (pg = tmp_page; pg; pg = tmp_page) { /* Needs to be called outside of ftrace_lock */ @@ -6967,7 +7498,7 @@ allocate_ftrace_mod_map(struct module *mod, return mod_map; } -static const char * +static int ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, unsigned long addr, unsigned long *size, unsigned long *off, char *sym) @@ -6988,21 +7519,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, *size = found_func->size; if (off) *off = addr - found_func->ip; - if (sym) - strscpy(sym, found_func->name, KSYM_NAME_LEN); - - return found_func->name; + return strscpy(sym, found_func->name, KSYM_NAME_LEN); } - return NULL; + return 0; } -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { struct ftrace_mod_map *mod_map; - const char *ret = NULL; + int ret = 0; /* mod_map is freed via call_rcu() */ preempt_disable(); @@ -7139,6 +7667,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; @@ -7180,12 +7709,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - if (pg->records) { - free_pages((unsigned long)pg->records, pg->order); - ftrace_number_of_pages -= 1 << pg->order; - } - ftrace_number_of_groups--; - kfree(pg); + pg->next = tmp_page; + tmp_page = pg; pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) ftrace_pages = pg; @@ -7202,6 +7727,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) clear_func_from_hashes(func); kfree(func); } + /* Need to synchronize with ftrace_location_range() */ + if (tmp_page) { + synchronize_rcu(); + ftrace_free_pages(tmp_page); + } } void __init ftrace_free_init_mem(void) @@ -7290,9 +7820,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { + if (tr->flags & TRACE_ARRAY_FL_MOD_INIT) + return; + INIT_LIST_HEAD(&tr->func_probes); INIT_LIST_HEAD(&tr->mod_trace); INIT_LIST_HEAD(&tr->mod_notrace); + + tr->flags |= TRACE_ARRAY_FL_MOD_INIT; } #else @@ -7321,8 +7856,10 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; - tr->ops->private = tr; + if (!global_ops.private) + global_ops.private = tr; ftrace_init_trace_array(tr); + init_array_fgraph_ops(tr, tr->ops); } void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) @@ -7403,6 +7940,7 @@ out: void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { + kmsan_unpoison_memory(fregs, ftrace_regs_size()); __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } #else @@ -7443,7 +7981,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); * have its own recursion protection, then it should call the * ftrace_ops_assist_func() instead. * - * Returns the function that the trampoline should call for @ops. + * Returns: the function that the trampoline should call for @ops. */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { @@ -7761,7 +8299,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); switch (type) { case TRACE_PIDS: @@ -7777,14 +8315,13 @@ pid_write(struct file *filp, const char __user *ubuf, lockdep_is_held(&ftrace_lock)); break; default: - ret = -EINVAL; WARN_ON_ONCE(1); - goto out; + return -EINVAL; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; switch (type) { case TRACE_PIDS: @@ -7813,11 +8350,8 @@ pid_write(struct file *filp, const char __user *ubuf, ftrace_update_pid_func(); ftrace_startup_all(0); - out: - mutex_unlock(&ftrace_lock); - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -7892,12 +8426,13 @@ void ftrace_kill(void) ftrace_disabled = 1; ftrace_enabled = 0; ftrace_trace_function = ftrace_stub; + kprobe_ftrace_kill(); } /** * ftrace_is_dead - Test if ftrace is dead or not. * - * Returns 1 if ftrace is "dead", zero otherwise. + * Returns: 1 if ftrace is "dead", zero otherwise. */ int ftrace_is_dead(void) { @@ -8142,8 +8677,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * - * This function returns 0 if all provided symbols are found, - * -ESRCH otherwise. + * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) { @@ -8217,20 +8751,20 @@ static bool is_permanent_ops_registered(void) } static int -ftrace_enable_sysctl(struct ctl_table *table, int write, +ftrace_enable_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret = -ENODEV; + int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out; + return -ENODEV; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) - goto out; + return ret; if (ftrace_enabled) { @@ -8244,8 +8778,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, } else { if (is_permanent_ops_registered()) { ftrace_enabled = true; - ret = -EBUSY; - goto out; + return -EBUSY; } /* stopping ftrace calls (just send to ftrace_stub) */ @@ -8255,12 +8788,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, } last_ftrace_enabled = !!ftrace_enabled; - out: - mutex_unlock(&ftrace_lock); - return ret; + return 0; } -static struct ctl_table ftrace_sysctls[] = { +static const struct ctl_table ftrace_sysctls[] = { { .procname = "ftrace_enabled", .data = &ftrace_enabled, @@ -8268,7 +8799,6 @@ static struct ctl_table ftrace_sysctls[] = { .mode = 0644, .proc_handler = ftrace_enable_sysctl, }, - {} }; static int __init ftrace_sysctl_init(void) diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 5012c04f92c0..3235470e61b3 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -15,6 +15,8 @@ extern struct ftrace_ops global_ops; int ftrace_startup(struct ftrace_ops *ops, int command); int ftrace_shutdown(struct ftrace_ops *ops, int command); int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); +int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command); +int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command); #else /* !CONFIG_DYNAMIC_FTRACE */ @@ -38,14 +40,26 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { return 1; } +static inline int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) +{ + return -EINVAL; +} +static inline int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) +{ + return -EINVAL; +} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern int ftrace_graph_active; -void update_function_graph_func(void); +# ifdef CONFIG_DYNAMIC_FTRACE +extern void fgraph_update_pid_func(void); +# else +static inline void fgraph_update_pid_func(void) {} +# endif #else /* !CONFIG_FUNCTION_GRAPH_TRACER */ # define ftrace_graph_active 0 -static inline void update_function_graph_func(void) { } +static inline void fgraph_update_pid_func(void) {} #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #else /* !CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 95106d02b32d..c62b9b3cfb3d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -354,7 +354,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) while (upper_count-- > 0) { union upper_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); if (!chunk) break; *upper_next = chunk; @@ -365,7 +365,7 @@ static void pid_list_refill_irq(struct irq_work *iwork) while (lower_count-- > 0) { union lower_chunk *chunk; - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); if (!chunk) break; *lower_next = chunk; @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) int i; /* According to linux/thread.h, pids can be no bigger that 30 bits */ - WARN_ON_ONCE(pid_max > (1 << 30)); + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) @@ -451,6 +451,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) /** * trace_pid_list_free - Frees an allocated pid_list. + * @pid_list: The pid list to free. * * Frees the memory for a pid_list that was allocated. */ diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 8c4ffd076162..314ffc143039 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); static struct completion done; -#define MIN(x, y) ((x) < (y) ? (x) : (y)) - static void busy_wait(ulong time) { u64 start, end; @@ -215,4 +213,5 @@ static void __exit preemptirq_delay_exit(void) module_init(preemptirq_delay_init) module_exit(preemptirq_delay_exit) +MODULE_DESCRIPTION("Preempt / IRQ disable delay thread to test latency tracers"); MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index fa03094e9e69..30d224946881 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; +#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) /* * This expects the caller will set up a rethook on a function entry. * When the function returns, the rethook will eventually be reclaimed @@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) */ if (unlikely(!rcu_is_watching())) return NULL; +#endif return (struct rethook_node *)objpool_pop(&rh->pool); } @@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame if (WARN_ON_ONCE(!cur)) return 0; - if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + if (tsk != current && task_is_running(tsk)) return 0; do { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index aa332ace108b..bb6089c2951e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> +#include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> @@ -26,10 +27,13 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> +#include "trace.h" + /* * The "absolute" timestamp in the buffer is only 59 bits. * If a clock has the 5 MSBs set, it needs to be saved and @@ -40,6 +44,21 @@ static void update_pages_handler(struct work_struct *work); +#define RING_BUFFER_META_MAGIC 0xBADFEED + +struct ring_buffer_meta { + int magic; + int struct_size; + unsigned long text_addr; + unsigned long data_addr; + unsigned long first_buffer; + unsigned long head_buffer; + unsigned long commit_buffer; + __u32 subbuf_size; + __u32 nr_subbufs; + int buffers[]; +}; + /* * The ring buffer header is special. We must manually up keep it. */ @@ -312,6 +331,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_MASK (3 << 30) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -338,6 +359,8 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id:30; /* ID for external mapping */ + u32 range:1; /* Mapped via a range */ struct buffer_data_page *page; /* Actual data page */ }; @@ -368,7 +391,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) static void free_buffer_page(struct buffer_page *bpage) { - free_pages((unsigned long)bpage->page, bpage->order); + /* Range pages are not to be freed */ + if (!bpage->range) + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -384,6 +409,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -456,6 +482,8 @@ struct ring_buffer_per_cpu { unsigned long nr_pages; unsigned int current_context; struct list_head *pages; + /* pages generation counter, incremented when the list changes */ + unsigned long cnt; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ @@ -483,6 +511,14 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned int mapped; + unsigned int user_mapped; /* user space mapping */ + struct mutex mapping_lock; + unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct trace_buffer_meta *meta_page; + struct ring_buffer_meta *ring_meta; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -511,6 +547,12 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + unsigned long range_addr_start; + unsigned long range_addr_end; + + long last_text_delta; + long last_data_delta; + unsigned int subbuf_size; unsigned int subbuf_order; unsigned int max_data_size; @@ -681,18 +723,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, } /** - * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer - * @buffer: The ring_buffer to get the number of pages from - * @cpu: The cpu of the ring_buffer to get the number of pages from - * - * Returns the number of pages used by a per_cpu buffer of the ring buffer. - */ -size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) -{ - return buffer->buffers[cpu]->nr_pages; -} - -/** * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from @@ -753,6 +783,9 @@ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + /* For waiters waiting for the first wake up */ + (void)atomic_fetch_inc_release(&rbwork->seq); + wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ @@ -834,51 +867,24 @@ static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; ret = !pagebusy && full_hit(buffer, cpu, full); - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; + if (!ret && (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full)) { + cpu_buffer->shortest_full = full; + } raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } return ret; } -/** - * ring_buffer_wait - wait for input to the ring buffer - * @buffer: buffer to wait on - * @cpu: the cpu buffer to wait on - * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS - * - * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon - * as data is added to any of the @buffer's cpu buffers. Otherwise - * it will wait for data to be added to a specific cpu buffer. - */ -int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) +static inline bool +rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, + int cpu, int full, ring_buffer_cond_fn cond, void *data) { - struct ring_buffer_per_cpu *cpu_buffer; - DEFINE_WAIT(wait); - struct rb_irq_work *work; - int ret = 0; - - /* - * Depending on what the caller is waiting for, either any - * data in any cpu buffer, or a specific buffer, put the - * caller on the appropriate wait queue. - */ - if (cpu == RING_BUFFER_ALL_CPUS) { - work = &buffer->irq_work; - /* Full only makes sense on per cpu reads */ - full = 0; - } else { - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return -ENODEV; - cpu_buffer = buffer->buffers[cpu]; - work = &cpu_buffer->irq_work; - } + if (rb_watermark_hit(buffer, cpu, full)) + return true; - if (full) - prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); - else - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + if (cond(data)) + return true; /* * The events can happen in critical sections where @@ -901,27 +907,82 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) * a task has been queued. It's OK for spurious wake ups. */ if (full) - work->full_waiters_pending = true; + rbwork->full_waiters_pending = true; else - work->waiters_pending = true; + rbwork->waiters_pending = true; - if (rb_watermark_hit(buffer, cpu, full)) - goto out; + return false; +} - if (signal_pending(current)) { - ret = -EINTR; - goto out; +struct rb_wait_data { + struct rb_irq_work *irq_work; + int seq; +}; + +/* + * The default wait condition for ring_buffer_wait() is to just to exit the + * wait loop the first time it is woken up. + */ +static bool rb_wait_once(void *data) +{ + struct rb_wait_data *rdata = data; + struct rb_irq_work *rbwork = rdata->irq_work; + + return atomic_read_acquire(&rbwork->seq) != rdata->seq; +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * @cond: condition function to break out of wait (NULL to run once) + * @data: the data to pass to @cond. + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, + ring_buffer_cond_fn cond, void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct wait_queue_head *waitq; + struct rb_irq_work *rbwork; + struct rb_wait_data rdata; + int ret = 0; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) { + rbwork = &buffer->irq_work; + /* Full only makes sense on per cpu reads */ + full = 0; + } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; + cpu_buffer = buffer->buffers[cpu]; + rbwork = &cpu_buffer->irq_work; } - schedule(); - out: if (full) - finish_wait(&work->full_waiters, &wait); + waitq = &rbwork->full_waiters; else - finish_wait(&work->waiters, &wait); + waitq = &rbwork->waiters; + + /* Set up to exit loop as soon as it is woken */ + if (!cond) { + cond = rb_wait_once; + rdata.irq_work = rbwork; + rdata.seq = atomic_read_acquire(&rbwork->seq); + data = &rdata; + } - if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current)) - ret = -EINTR; + ret = wait_event_interruptible((*waitq), + rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); return ret; } @@ -959,21 +1020,30 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, } if (full) { - unsigned long flags; - poll_wait(filp, &rbwork->full_waiters, poll_table); - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + if (rb_watermark_hit(buffer, cpu, full)) + return EPOLLIN | EPOLLRDNORM; + /* + * Only allow full_waiters_pending update to be seen after + * the shortest_full is set (in rb_watermark_hit). If the + * writer sees the full_waiters_pending flag set, it will + * compare the amount in the ring buffer to shortest_full. + * If the amount in the ring buffer is greater than the + * shortest_full percent, it will call the irq_work handler + * to wake up this list. The irq_handler will reset shortest_full + * back to zero. That's done under the reader_lock, but + * the below smp_mb() makes sure that the update to + * full_waiters_pending doesn't leak up into the above. + */ + smp_mb(); rbwork->full_waiters_pending = true; - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - } else { - poll_wait(filp, &rbwork->waiters, poll_table); - rbwork->waiters_pending = true; + return 0; } + poll_wait(filp, &rbwork->waiters, poll_table); + rbwork->waiters_pending = true; + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit @@ -989,9 +1059,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); - if (full) - return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; - if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; @@ -1022,7 +1089,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer) u64 ts; /* Skip retpolines :-( */ - if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local)) ts = trace_clock_local(); else ts = buffer->clock(); @@ -1202,6 +1269,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) * Set the previous list pointer to have the HEAD flag. */ rb_set_list_to_head(head->list.prev); + + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->head_buffer = (unsigned long)head->page; + } } static void rb_list_head_clear(struct list_head *list) @@ -1355,7 +1427,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -1392,8 +1463,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - /* Again, either we update tail_page or an interrupt does */ - (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + /* Either we update tail_page or an interrupt does */ + if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) + local_inc(&cpu_buffer->pages_touched); } } @@ -1405,6 +1477,20 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); } +static bool rb_check_links(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(list->next)->prev) != list)) + return false; + + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(list->prev)->next) != list)) + return false; + + return true; +} + /** * rb_check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -1414,31 +1500,567 @@ static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, */ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct list_head *head = rb_list_head(cpu_buffer->pages); - struct list_head *tmp; + struct list_head *head, *tmp; + unsigned long buffer_cnt; + unsigned long flags; + int nr_loops = 0; - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(head->next)->prev) != head)) + /* + * Walk the linked list underpinning the ring buffer and validate all + * its next and prev links. + * + * The check acquires the reader_lock to avoid concurrent processing + * with code that could be modifying the list. However, the lock cannot + * be held for the entire duration of the walk, as this would make the + * time when interrupts are disabled non-deterministic, dependent on the + * ring buffer size. Therefore, the code releases and re-acquires the + * lock after checking each page. The ring_buffer_per_cpu.cnt variable + * is then used to detect if the list was modified while the lock was + * not held, in which case the check needs to be restarted. + * + * The code attempts to perform the check at most three times before + * giving up. This is acceptable because this is only a self-validation + * to detect problems early on. In practice, the list modification + * operations are fairly spaced, and so this check typically succeeds at + * most on the second try. + */ +again: + if (++nr_loops > 3) return; - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(head->prev)->next) != head)) + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + head = rb_list_head(cpu_buffer->pages); + if (!rb_check_links(cpu_buffer, head)) + goto out_locked; + buffer_cnt = cpu_buffer->cnt; + tmp = head; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + while (true) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (buffer_cnt != cpu_buffer->cnt) { + /* The list was updated, try again. */ + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + goto again; + } + + tmp = rb_list_head(tmp->next); + if (tmp == head) + /* The iteration circled back, all is done. */ + goto out_locked; + + if (!rb_check_links(cpu_buffer, tmp)) + goto out_locked; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } + +out_locked: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} + +/* + * Take an address, add the meta data size as well as the array of + * array subbuffer indexes, then align it to a subbuffer size. + * + * This is used to help find the next per cpu subbuffer within a mapped range. + */ +static unsigned long +rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) +{ + addr += sizeof(struct ring_buffer_meta) + + sizeof(int) * nr_subbufs; + return ALIGN(addr, subbuf_size); +} + +/* + * Return the ring_buffer_meta for a given @cpu. + */ +static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) +{ + int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *meta; + int nr_subbufs; + + if (!ptr) + return NULL; + + /* When nr_pages passed in is zero, the first meta has already been initialized */ + if (!nr_pages) { + meta = (struct ring_buffer_meta *)ptr; + nr_subbufs = meta->nr_subbufs; + } else { + meta = NULL; + /* Include the reader page */ + nr_subbufs = nr_pages + 1; + } + + /* + * The first chunk may not be subbuffer aligned, where as + * the rest of the chunks are. + */ + if (cpu) { + ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); + ptr += subbuf_size * nr_subbufs; + + /* We can use multiplication to find chunks greater than 1 */ + if (cpu > 1) { + unsigned long size; + unsigned long p; + + /* Save the beginning of this CPU chunk */ + p = ptr; + ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs); + ptr += subbuf_size * nr_subbufs; + + /* Now all chunks after this are the same size */ + size = ptr - p; + ptr += size * (cpu - 2); + } + } + return (void *)ptr; +} + +/* Return the start of subbufs given the meta pointer */ +static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +{ + int subbuf_size = meta->subbuf_size; + unsigned long ptr; + + ptr = (unsigned long)meta; + ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs); + + return (void *)ptr; +} + +/* + * Return a specific sub-buffer for a given @cpu defined by @idx. + */ +static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) +{ + struct ring_buffer_meta *meta; + unsigned long ptr; + int subbuf_size; + + meta = rb_range_meta(cpu_buffer->buffer, 0, cpu_buffer->cpu); + if (!meta) + return NULL; + + if (WARN_ON_ONCE(idx >= meta->nr_subbufs)) + return NULL; + + subbuf_size = meta->subbuf_size; + + /* Map this buffer to the order that's in meta->buffers[] */ + idx = meta->buffers[idx]; + + ptr = (unsigned long)rb_subbufs_from_meta(meta); + + ptr += subbuf_size * idx; + if (ptr + subbuf_size > cpu_buffer->buffer->range_addr_end) + return NULL; + + return (void *)ptr; +} + +/* + * See if the existing memory contains valid ring buffer data. + * As the previous kernel must be the same as this kernel, all + * the calculations (size of buffers and number of buffers) + * must be the same. + */ +static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) +{ + int subbuf_size = PAGE_SIZE; + struct buffer_data_page *subbuf; + unsigned long buffers_start; + unsigned long buffers_end; + int i; + + if (!subbuf_mask) + return false; + + /* Check the meta magic and meta struct size */ + if (meta->magic != RING_BUFFER_META_MAGIC || + meta->struct_size != sizeof(*meta)) { + pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); + return false; + } + + /* The subbuffer's size and number of subbuffers must match */ + if (meta->subbuf_size != subbuf_size || + meta->nr_subbufs != nr_pages + 1) { + pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); + return false; + } + + buffers_start = meta->first_buffer; + buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); + + /* Is the head and commit buffers within the range of buffers? */ + if (meta->head_buffer < buffers_start || + meta->head_buffer >= buffers_end) { + pr_info("Ring buffer boot meta [%d] head buffer out of range\n", cpu); + return false; + } + + if (meta->commit_buffer < buffers_start || + meta->commit_buffer >= buffers_end) { + pr_info("Ring buffer boot meta [%d] commit buffer out of range\n", cpu); + return false; + } + + subbuf = rb_subbufs_from_meta(meta); + + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + + /* Is the meta buffers and the subbufs themselves have correct data? */ + for (i = 0; i < meta->nr_subbufs; i++) { + if (meta->buffers[i] < 0 || + meta->buffers[i] >= meta->nr_subbufs) { + pr_info("Ring buffer boot meta [%d] array out of range\n", cpu); + return false; + } + + if ((unsigned)local_read(&subbuf->commit) > subbuf_size) { + pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu); + return false; + } + + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); + subbuf = (void *)subbuf + subbuf_size; + } + + return true; +} + +static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); + +static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, + unsigned long long *timestamp, u64 *delta_ptr) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int events = 0; + int e; + + *delta_ptr = 0; + *timestamp = 0; + + ts = dpage->time_stamp; + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(dpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = rb_event_time_stamp(event); + ts += delta; + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + *delta_ptr = delta; + *timestamp = ts; + return -1; + } + ts = delta; + break; + + case RINGBUF_TYPE_PADDING: + if (event->time_delta == 1) + break; + fallthrough; + case RINGBUF_TYPE_DATA: + events++; + ts += event->time_delta; + break; + + default: + return -1; + } + } + *timestamp = ts; + return events; +} + +static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) +{ + unsigned long long ts; + u64 delta; + int tail; + + tail = local_read(&dpage->commit); + return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta); +} + +/* If the meta data has been validated, now validate the events */ +static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct buffer_page *head_page; + unsigned long entry_bytes = 0; + unsigned long entries = 0; + int ret; + int i; + + if (!meta || !meta->head_buffer) return; - for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) { - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(tmp->next)->prev) != tmp)) - return; + /* Do the reader page first */ + ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); + if (ret < 0) { + pr_info("Ring buffer reader page is invalid\n"); + goto invalid; + } + entries += ret; + entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); + local_set(&cpu_buffer->reader_page->entries, ret); - if (RB_WARN_ON(cpu_buffer, - rb_list_head(rb_list_head(tmp->prev)->next) != tmp)) - return; + head_page = cpu_buffer->head_page; + + /* If both the head and commit are on the reader_page then we are done. */ + if (head_page == cpu_buffer->reader_page && + head_page == cpu_buffer->commit_page) + goto done; + + /* Iterate until finding the commit page */ + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { + + /* Reader page has already been done */ + if (head_page == cpu_buffer->reader_page) + continue; + + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) { + pr_info("Ring buffer meta [%d] invalid buffer page\n", + cpu_buffer->cpu); + goto invalid; + } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + + entries += ret; + entry_bytes += local_read(&head_page->page->commit); + local_set(&cpu_buffer->head_page->entries, ret); + + if (head_page == cpu_buffer->commit_page) + break; + } + + if (head_page != cpu_buffer->commit_page) { + pr_info("Ring buffer meta [%d] commit page not found\n", + cpu_buffer->cpu); + goto invalid; + } + done: + local_set(&cpu_buffer->entries, entries); + local_set(&cpu_buffer->entries_bytes, entry_bytes); + + pr_info("Ring buffer meta [%d] is from previous boot!\n", cpu_buffer->cpu); + return; + + invalid: + /* The content of the buffers are invalid, reset the meta data */ + meta->head_buffer = 0; + meta->commit_buffer = 0; + + /* Reset the reader page */ + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); + + /* Reset all the subbuffers */ + for (i = 0; i < meta->nr_subbufs - 1; i++, rb_inc_page(&head_page)) { + local_set(&head_page->entries, 0); + local_set(&head_page->page->commit, 0); + } +} + +/* Used to calculate data delta */ +static char rb_data_ptr[] = ""; + +#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) +#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) + +static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) +{ + meta->text_addr = THIS_TEXT_PTR; + meta->data_addr = THIS_DATA_PTR; +} + +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +{ + struct ring_buffer_meta *meta; + unsigned long *subbuf_mask; + unsigned long delta; + void *subbuf; + int cpu; + int i; + + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + void *next_meta; + + meta = rb_range_meta(buffer, nr_pages, cpu); + + if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { + /* Make the mappings match the current address */ + subbuf = rb_subbufs_from_meta(meta); + delta = (unsigned long)subbuf - meta->first_buffer; + meta->first_buffer += delta; + meta->head_buffer += delta; + meta->commit_buffer += delta; + buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; + buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; + continue; + } + + if (cpu < nr_cpu_ids - 1) + next_meta = rb_range_meta(buffer, nr_pages, cpu + 1); + else + next_meta = (void *)buffer->range_addr_end; + + memset(meta, 0, next_meta - (void *)meta); + + meta->magic = RING_BUFFER_META_MAGIC; + meta->struct_size = sizeof(*meta); + + meta->nr_subbufs = nr_pages + 1; + meta->subbuf_size = PAGE_SIZE; + + subbuf = rb_subbufs_from_meta(meta); + + meta->first_buffer = (unsigned long)subbuf; + rb_meta_init_text_addr(meta); + + /* + * The buffers[] array holds the order of the sub-buffers + * that are after the meta data. The sub-buffers may + * be swapped out when read and inserted into a different + * location of the ring buffer. Although their addresses + * remain the same, the buffers[] array contains the + * index into the sub-buffers holding their actual order. + */ + for (i = 0; i < meta->nr_subbufs; i++) { + meta->buffers[i] = i; + rb_init_page(subbuf); + subbuf += meta->subbuf_size; + } + } + bitmap_free(subbuf_mask); +} + +static void *rbm_start(struct seq_file *m, loff_t *pos) +{ + struct ring_buffer_per_cpu *cpu_buffer = m->private; + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + unsigned long val; + + if (!meta) + return NULL; + + if (*pos > meta->nr_subbufs) + return NULL; + + val = *pos; + val++; + + return (void *)val; +} + +static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + + return rbm_start(m, pos); +} + +static int rbm_show(struct seq_file *m, void *v) +{ + struct ring_buffer_per_cpu *cpu_buffer = m->private; + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + unsigned long val = (unsigned long)v; + + if (val == 1) { + seq_printf(m, "head_buffer: %d\n", + rb_meta_subbuf_idx(meta, (void *)meta->head_buffer)); + seq_printf(m, "commit_buffer: %d\n", + rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer)); + seq_printf(m, "subbuf_size: %d\n", meta->subbuf_size); + seq_printf(m, "nr_subbufs: %d\n", meta->nr_subbufs); + return 0; + } + + val -= 2; + seq_printf(m, "buffer[%ld]: %d\n", val, meta->buffers[val]); + + return 0; +} + +static void rbm_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations rb_meta_seq_ops = { + .start = rbm_start, + .next = rbm_next, + .show = rbm_show, + .stop = rbm_stop, +}; + +int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &rb_meta_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = buffer->buffers[cpu]; + + return 0; +} + +/* Map the buffer_pages to the previous head and commit pages */ +static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + + if (meta->head_buffer == (unsigned long)bpage->page) + cpu_buffer->head_page = bpage; + + if (meta->commit_buffer == (unsigned long)bpage->page) { + cpu_buffer->commit_page = bpage; + cpu_buffer->tail_page = bpage; } } static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { + struct trace_buffer *buffer = cpu_buffer->buffer; + struct ring_buffer_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -1473,6 +2095,10 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, */ if (user_thread) set_current_oom_origin(); + + if (buffer->range_addr_start) + meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1483,15 +2109,32 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_check_bpage(cpu_buffer, bpage); - list_add(&bpage->list, pages); - - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, - cpu_buffer->buffer->subbuf_order); - if (!page) - goto free_pages; - bpage->page = page_address(page); + /* + * Append the pages as for mapped buffers we want to keep + * the order + */ + list_add_tail(&bpage->list, pages); + + if (meta) { + /* A range was given. Use that for the buffer page */ + bpage->page = rb_range_buffer(cpu_buffer, i + 1); + if (!bpage->page) + goto free_pages; + /* If this is valid from a previous boot */ + if (meta->head_buffer) + rb_meta_buffer_update(cpu_buffer, bpage); + bpage->range = 1; + bpage->id = i + 1; + } else { + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + mflags | __GFP_COMP | __GFP_ZERO, + cpu_buffer->buffer->subbuf_order); + if (!page) + goto free_pages; + bpage->page = page_address(page); + rb_init_page(bpage->page); + } bpage->order = cpu_buffer->buffer->subbuf_order; - rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) goto free_pages; @@ -1541,6 +2184,7 @@ static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_meta *meta; struct buffer_page *bpage; struct page *page; int ret; @@ -1560,6 +2204,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1570,11 +2215,28 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); - if (!page) - goto fail_free_reader; - bpage->page = page_address(page); - rb_init_page(bpage->page); + if (buffer->range_addr_start) { + /* + * Range mapped buffers have the same restrictions as memory + * mapped ones do. + */ + cpu_buffer->mapped = 1; + cpu_buffer->ring_meta = rb_range_meta(buffer, nr_pages, cpu); + bpage->page = rb_range_buffer(cpu_buffer, 0); + if (!bpage->page) + goto fail_free_reader; + if (cpu_buffer->ring_meta->head_buffer) + rb_meta_buffer_update(cpu_buffer, bpage); + bpage->range = 1; + } else { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_COMP | __GFP_ZERO, + cpu_buffer->buffer->subbuf_order); + if (!page) + goto fail_free_reader; + bpage->page = page_address(page); + rb_init_page(bpage->page); + } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); INIT_LIST_HEAD(&cpu_buffer->new_pages); @@ -1583,11 +2245,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (ret < 0) goto fail_free_reader; - cpu_buffer->head_page - = list_entry(cpu_buffer->pages, struct buffer_page, list); - cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + rb_meta_validate_events(cpu_buffer); + + /* If the boot meta was valid then this has already been updated */ + meta = cpu_buffer->ring_meta; + if (!meta || !meta->head_buffer || + !cpu_buffer->head_page || !cpu_buffer->commit_page || !cpu_buffer->tail_page) { + if (meta && meta->head_buffer && + (cpu_buffer->head_page || cpu_buffer->commit_page || cpu_buffer->tail_page)) { + pr_warn("Ring buffer meta buffers not all mapped\n"); + if (!cpu_buffer->head_page) + pr_warn(" Missing head_page\n"); + if (!cpu_buffer->commit_page) + pr_warn(" Missing commit_page\n"); + if (!cpu_buffer->tail_page) + pr_warn(" Missing tail_page\n"); + } - rb_head_page_activate(cpu_buffer); + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + rb_head_page_activate(cpu_buffer); + + if (cpu_buffer->ring_meta) + meta->commit_buffer = meta->head_buffer; + } else { + /* The valid meta buffer still needs to activate the head page */ + rb_head_page_activate(cpu_buffer); + } return cpu_buffer; @@ -1624,22 +2310,14 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -/** - * __ring_buffer_alloc - allocate a new ring_buffer - * @size: the size in bytes per cpu that is needed. - * @flags: attributes to set for the ring buffer. - * @key: ring buffer reader_lock_key. - * - * Currently the only flag that is available is the RB_FL_OVERWRITE - * flag. This flag means that the buffer will overwrite old data - * when the buffer wraps. If this flag is not set, the buffer will - * drop data when the tail hits the head. - */ -struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, - struct lock_class_key *key) +static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, + int order, unsigned long start, + unsigned long end, + struct lock_class_key *key) { struct trace_buffer *buffer; long nr_pages; + int subbuf_size; int bsize; int cpu; int ret; @@ -1653,14 +2331,13 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - /* Default buffer page size - one system page */ - buffer->subbuf_order = 0; - buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + buffer->subbuf_order = order; + subbuf_size = (PAGE_SIZE << order); + buffer->subbuf_size = subbuf_size - BUF_PAGE_HDR_SIZE; /* Max payload is buffer page size - header (8bytes) */ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); - nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -1668,10 +2345,6 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&buffer->irq_work.waiters); - /* need at least two pages */ - if (nr_pages < 2) - nr_pages = 2; - buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1680,6 +2353,56 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + /* If start/end are specified, then that overrides size */ + if (start && end) { + unsigned long ptr; + int n; + + size = end - start; + size = size / nr_cpu_ids; + + /* + * The number of sub-buffers (nr_pages) is determined by the + * total size allocated minus the meta data size. + * Then that is divided by the number of per CPU buffers + * needed, plus account for the integer array index that + * will be appended to the meta data. + */ + nr_pages = (size - sizeof(struct ring_buffer_meta)) / + (subbuf_size + sizeof(int)); + /* Need at least two pages plus the reader page */ + if (nr_pages < 3) + goto fail_free_buffers; + + again: + /* Make sure that the size fits aligned */ + for (n = 0, ptr = start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_meta) + + sizeof(int) * nr_pages; + ptr = ALIGN(ptr, subbuf_size); + ptr += subbuf_size * nr_pages; + } + if (ptr > end) { + if (nr_pages <= 3) + goto fail_free_buffers; + nr_pages--; + goto again; + } + + /* nr_pages should not count the reader page */ + nr_pages--; + buffer->range_addr_start = start; + buffer->range_addr_end = end; + + rb_range_meta_init(buffer, nr_pages); + } else { + + /* need at least two pages */ + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); + if (nr_pages < 2) + nr_pages = 2; + } + cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); @@ -1708,9 +2431,73 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, kfree(buffer); return NULL; } + +/** + * __ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * @key: ring buffer reader_lock_key. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) +{ + /* Default buffer page size - one system page */ + return alloc_buffer(size, flags, 0, 0, 0,key); + +} EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** + * __ring_buffer_alloc_range - allocate a new ring_buffer from existing memory + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * @order: sub-buffer order + * @start: start of allocated range + * @range_size: size of allocated range + * @key: ring buffer reader_lock_key. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, + int order, unsigned long start, + unsigned long range_size, + struct lock_class_key *key) +{ + return alloc_buffer(size, flags, order, start, start + range_size, key); +} + +/** + * ring_buffer_last_boot_delta - return the delta offset from last boot + * @buffer: The buffer to return the delta from + * @text: Return text delta + * @data: Return data delta + * + * Returns: The true if the delta is non zero + */ +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, + long *data) +{ + if (!buffer) + return false; + + if (!buffer->last_text_delta) + return false; + + *text = buffer->last_text_delta; + *data = buffer->last_data_delta; + + return true; +} + +/** * ring_buffer_free - free a ring buffer. * @buffer: the buffer to free. */ @@ -1749,8 +2536,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) return buffer->time_stamp_abs; } -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; @@ -1819,6 +2604,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) /* make sure pages points to a valid page in the ring buffer */ cpu_buffer->pages = next_page; + cpu_buffer->cnt++; /* update head page */ if (head_bit) @@ -1925,6 +2711,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) * pointer to point to end of list */ head_page->prev = last_page; + cpu_buffer->cnt++; success = true; break; } @@ -2278,7 +3065,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) /* Size is determined by what has been committed */ static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { - return rb_page_commit(bpage); + return rb_page_commit(bpage) & ~RB_MISSED_MASK; } static __always_inline unsigned @@ -2317,6 +3104,52 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->next_event = 0; } +/* Return the index into the sub-buffers for a given sub-buffer */ +static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +{ + void *subbuf_array; + + subbuf_array = (void *)meta + sizeof(int) * meta->nr_subbufs; + subbuf_array = (void *)ALIGN((unsigned long)subbuf_array, meta->subbuf_size); + return (subbuf - subbuf_array) / meta->subbuf_size; +} + +static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *next_page) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + unsigned long old_head = (unsigned long)next_page->page; + unsigned long new_head; + + rb_inc_page(&next_page); + new_head = (unsigned long)next_page->page; + + /* + * Only move it forward once, if something else came in and + * moved it forward, then we don't want to touch it. + */ + (void)cmpxchg(&meta->head_buffer, old_head, new_head); +} + +static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *reader) +{ + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + void *old_reader = cpu_buffer->reader_page->page; + void *new_reader = reader->page; + int id; + + id = reader->id; + cpu_buffer->reader_page->id = id; + reader->id = 0; + + meta->buffers[0] = rb_meta_subbuf_idx(meta, new_reader); + meta->buffers[id] = rb_meta_subbuf_idx(meta, old_reader); + + /* The head pointer is the one after the reader */ + rb_update_meta_head(cpu_buffer, reader); +} + /* * rb_handle_head_page - writer hit the head page * @@ -2366,6 +3199,8 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); + if (cpu_buffer->ring_meta) + rb_update_meta_head(cpu_buffer, next_page); /* * The entries will be zeroed out when we move the * tail page. @@ -2927,6 +3762,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; + } /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -3241,7 +4080,7 @@ static const char *show_irq_str(int bits) return type[bits]; } -/* Assume this is an trace event */ +/* Assume this is a trace event */ static const char *show_flags(struct ring_buffer_event *event) { struct trace_entry *entry; @@ -3373,11 +4212,10 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info, unsigned long tail) { - struct ring_buffer_event *event; struct buffer_data_page *bpage; u64 ts, delta; bool full = false; - int e; + int ret; bpage = info->tail_page->page; @@ -3403,39 +4241,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; - ts = bpage->time_stamp; - - for (e = 0; e < tail; e += rb_event_length(event)) { - - event = (struct ring_buffer_event *)(bpage->data + e); - - switch (event->type_len) { - - case RINGBUF_TYPE_TIME_EXTEND: - delta = rb_event_time_stamp(event); - ts += delta; - break; - - case RINGBUF_TYPE_TIME_STAMP: - delta = rb_event_time_stamp(event); - delta = rb_fix_abs_ts(delta, ts); - if (delta < ts) { - buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", - cpu_buffer->cpu, ts, delta); - } - ts = delta; - break; - - case RINGBUF_TYPE_PADDING: - if (event->time_delta == 1) - break; - fallthrough; - case RINGBUF_TYPE_DATA: - ts += event->time_delta; - break; - - default: - RB_WARN_ON(cpu_buffer, 1); + ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); + if (ret < 0) { + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + goto out; } } if ((full && ts > info->ts) || @@ -3610,8 +4421,13 @@ rb_reserve_next_event(struct trace_buffer *buffer, int nr_loops = 0; int add_ts_default; - /* ring buffer does cmpxchg, make sure it is safe in NMI context */ - if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + /* + * ring buffer does cmpxchg as well as atomic64 operations + * (which some archs use locking for atomic64), make sure this + * is safe in NMI context + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) && (unlikely(in_nmi()))) { return NULL; } @@ -3894,40 +4710,22 @@ int ring_buffer_write(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = rb_set_head_page(cpu_buffer); - struct buffer_page *commit = cpu_buffer->commit_page; - - /* In case of error, head will be NULL */ - if (unlikely(!head)) - return true; - - /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_commit(reader)) - return false; - - /* - * If writers are committing on the reader page, knowing all - * committed content has been read, the ring buffer is empty. - */ - if (commit == reader) - return true; - - /* - * If writers are committing on a page other than reader page - * and head page, there should always be content to read. - */ - if (commit != head) - return false; + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} - /* - * Writers are committing on the head page, we just need - * to care about there're committed data, and the reader will - * swap reader page with head page when it is to read data. - */ - return rb_page_commit(commit) == 0; +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + return !rb_num_of_entries(cpu_buffer); } /** @@ -4073,19 +4871,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); -/* - * The total entries in the ring buffer is the running counter - * of entries entered into the ring buffer, minus the sum of - * the entries read from the ring buffer and the number of - * entries that were overwritten. - */ -static inline unsigned long -rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) -{ - return local_read(&cpu_buffer->entries) - - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); -} - /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer @@ -4350,7 +5135,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; - commit_page = cpu_buffer->commit_page; + commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* @@ -4376,7 +5161,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) return ((iter->head_page == commit_page && iter->head >= commit) || (iter->head_page == reader && commit_page == head_page && head_page->read == commit && - iter->head == rb_page_commit(cpu_buffer->reader_page))); + iter->head == rb_page_size(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -4544,6 +5329,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (!ret) goto spin; + if (cpu_buffer->ring_meta) + rb_update_meta_reader(cpu_buffer, reader); + /* * Yay! We succeeded in replacing the page. * @@ -4552,6 +5340,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; rb_inc_page(&cpu_buffer->head_page); + cpu_buffer->cnt++; local_inc(&cpu_buffer->pages_read); /* Finally update the reader page to the new head */ @@ -4996,13 +5785,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer recording + * through the buffer. Memory is allocated, buffer resizing * is disabled, and the iterator pointer is returned to the caller. * - * Disabling buffer recording prevents the reading from being - * corrupted. This is not a consuming read, so a producer is not - * expected. - * * After a sequence of ring_buffer_read_prepare calls, the user is * expected to make at least one call to ring_buffer_read_prepare_sync. * Afterwards, ring_buffer_read_start is invoked to get things going @@ -5089,24 +5874,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_start); * ring_buffer_read_finish - finish reading the iterator of the buffer * @iter: The iterator retrieved by ring_buffer_start * - * This re-enables the recording to the buffer, and frees the - * iterator. + * This re-enables resizing of the buffer, and frees the iterator. */ void ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; - unsigned long flags; - /* - * Ring buffer is disabled from recording, here's a good place - * to check the integrity of the ring buffer. - * Must prevent readers from trying to read, as the check - * clears the HEAD page and readers require it. - */ - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Use this opportunity to check the integrity of the ring buffer. */ rb_check_pages(cpu_buffer); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->resize_disabled); kfree(iter->event); @@ -5171,6 +5947,25 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + + if (!meta) + return; + + meta->reader.read = cpu_buffer->reader_page->read; + meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.lost_events = cpu_buffer->lost_events; + + meta->entries = local_read(&cpu_buffer->entries); + meta->overrun = local_read(&cpu_buffer->overrun); + meta->read = cpu_buffer->read; + + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { @@ -5217,6 +6012,14 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; + + if (cpu_buffer->mapped) { + rb_update_meta_page(cpu_buffer); + if (cpu_buffer->ring_meta) { + struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + meta->commit_buffer = meta->head_buffer; + } + } } /* Must have disabled the cpu buffer then done a synchronize_rcu */ @@ -5247,6 +6050,7 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -5265,6 +6069,11 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); + /* Make sure persistent meta now uses this buffer's addresses */ + meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); + if (meta) + rb_meta_init_text_addr(meta); + mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -5279,6 +6088,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -5306,6 +6116,11 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); + /* Make sure persistent meta now uses this buffer's addresses */ + meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); + if (meta) + rb_meta_init_text_addr(meta); + atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } @@ -5429,6 +6244,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + /* It's up to the callers to not try to swap mapped buffers */ + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { + ret = -EBUSY; + goto out; + } + /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; @@ -5538,7 +6359,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) if (bpage->data) goto out; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); @@ -5679,7 +6501,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, event = rb_reader_event(cpu_buffer); read = reader->read; - commit = rb_page_commit(reader); + commit = rb_page_size(reader); /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; @@ -5692,7 +6514,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { + cpu_buffer->reader_page == cpu_buffer->commit_page || + cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -5755,7 +6578,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += rb_page_commit(reader); + cpu_buffer->read_bytes += rb_page_size(reader); /* swap the pages */ rb_init_page(bpage); @@ -5915,6 +6738,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer = buffer->buffers[cpu]; + if (cpu_buffer->mapped) { + err = -EBUSY; + goto error; + } + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); @@ -5939,39 +6767,39 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } for_each_buffer_cpu(buffer, cpu) { + struct buffer_data_page *old_free_data_page; + struct list_head old_pages; + unsigned long flags; if (!cpumask_test_cpu(cpu, buffer->cpumask)) continue; cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* Clear the head bit to make the link list normal to read */ rb_head_page_deactivate(cpu_buffer); - /* Now walk the list and free all the old sub buffers */ - list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - /* The above loop stopped an the last page needing to be freed */ - bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); - free_buffer_page(bpage); - - /* Free the current reader page */ - free_buffer_page(cpu_buffer->reader_page); + /* + * Collect buffers from the cpu_buffer pages list and the + * reader_page on old_pages, so they can be freed later when not + * under a spinlock. The pages list is a linked list with no + * head, adding old_pages turns it into a regular list with + * old_pages being the head. + */ + list_add(&old_pages, cpu_buffer->pages); + list_add(&cpu_buffer->reader_page->list, &old_pages); /* One page was allocated for the reader page */ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, struct buffer_page, list); list_del_init(&cpu_buffer->reader_page->list); - /* The cpu_buffer pages are a link list with no head */ + /* Install the new pages, remove the head from the list */ cpu_buffer->pages = cpu_buffer->new_pages.next; - cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; - cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; - - /* Clear the new_pages list */ - INIT_LIST_HEAD(&cpu_buffer->new_pages); + list_del_init(&cpu_buffer->new_pages); + cpu_buffer->cnt++; cpu_buffer->head_page = list_entry(cpu_buffer->pages, struct buffer_page, list); @@ -5980,11 +6808,20 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; cpu_buffer->nr_pages_to_update = 0; - free_pages((unsigned long)cpu_buffer->free_page, old_order); + old_free_data_page = cpu_buffer->free_page; cpu_buffer->free_page = NULL; rb_head_page_activate(cpu_buffer); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* Free old sub buffers */ + list_for_each_entry_safe(bpage, tmp, &old_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + free_pages((unsigned long)old_free_data_page, old_order); + rb_check_pages(cpu_buffer); } @@ -6016,6 +6853,441 @@ error: } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct page *page; + + if (cpu_buffer->meta_page) + return 0; + + page = alloc_page(GFP_USER | __GFP_ZERO); + if (!page) + return -ENOMEM; + + cpu_buffer->meta_page = page_to_virt(page); + + return 0; +} + +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long addr = (unsigned long)cpu_buffer->meta_page; + + free_page(addr); + cpu_buffer->meta_page = NULL; +} + +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *subbuf_ids) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; + struct buffer_page *first_subbuf, *subbuf; + int id = 0; + + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; + cpu_buffer->reader_page->id = id++; + + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); + do { + if (WARN_ON(id >= nr_subbufs)) + break; + + subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf->id = id; + + rb_inc_page(&subbuf); + id++; + } while (subbuf != first_subbuf); + + /* install subbuf ID to kern VA translation */ + cpu_buffer->subbuf_ids = subbuf_ids; + + meta->meta_struct_len = sizeof(*meta); + meta->nr_subbufs = nr_subbufs; + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + meta->meta_page_size = meta->subbuf_size; + + rb_update_meta_page(cpu_buffer); +} + +static struct ring_buffer_per_cpu * +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-EINVAL); + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->user_mapped) { + mutex_unlock(&cpu_buffer->mapping_lock); + return ERR_PTR(-ENODEV); + } + + return cpu_buffer; +} + +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + mutex_unlock(&cpu_buffer->mapping_lock); +} + +/* + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need + * to be set-up or torn-down. + */ +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, + bool inc) +{ + unsigned long flags; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + /* mapped is always greater or equal to user_mapped */ + if (WARN_ON(cpu_buffer->mapped < cpu_buffer->user_mapped)) + return -EINVAL; + + if (inc && cpu_buffer->mapped == UINT_MAX) + return -EBUSY; + + if (WARN_ON(!inc && cpu_buffer->user_mapped == 0)) + return -EINVAL; + + mutex_lock(&cpu_buffer->buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (inc) { + cpu_buffer->user_mapped++; + cpu_buffer->mapped++; + } else { + cpu_buffer->user_mapped--; + cpu_buffer->mapped--; + } + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + mutex_unlock(&cpu_buffer->buffer->mutex); + + return 0; +} + +/* + * +--------------+ pgoff == 0 + * | meta page | + * +--------------+ pgoff == 1 + * | subbuffer 0 | + * | | + * +--------------+ pgoff == (1 + (1 << subbuf_order)) + * | subbuffer 1 | + * | | + * ... + */ +#ifdef CONFIG_MMU +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; + unsigned int subbuf_pages, subbuf_order; + struct page **pages; + int p = 0, s = 0; + int err; + + /* Refuse MP_PRIVATE or writable mappings */ + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || + !(vma->vm_flags & VM_MAYSHARE)) + return -EPERM; + + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + if (subbuf_order && pgoff % subbuf_pages) + return -EINVAL; + + /* + * Make sure the mapping cannot become writable later. Also tell the VM + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). + */ + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP, + VM_MAYWRITE); + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ + nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ + if (nr_pages <= pgoff) + return -EINVAL; + + nr_pages -= pgoff; + + nr_vma_pages = vma_pages(vma); + if (!nr_vma_pages || nr_vma_pages > nr_pages) + return -EINVAL; + + nr_pages = nr_vma_pages; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (!pgoff) { + unsigned long meta_page_padding; + + pages[p++] = virt_to_page(cpu_buffer->meta_page); + + /* + * Pad with the zero-page to align the meta-page with the + * sub-buffers. + */ + meta_page_padding = subbuf_pages - 1; + while (meta_page_padding-- && p < nr_pages) { + unsigned long __maybe_unused zero_addr = + vma->vm_start + (PAGE_SIZE * p); + + pages[p++] = ZERO_PAGE(zero_addr); + } + } else { + /* Skip the meta-page */ + pgoff -= subbuf_pages; + + s += pgoff / subbuf_pages; + } + + while (p < nr_pages) { + struct page *page; + int off = 0; + + if (WARN_ON_ONCE(s >= nr_subbufs)) { + err = -EINVAL; + goto out; + } + + page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + + for (; off < (1 << (subbuf_order)); off++, page++) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } + + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + +out: + kfree(pages); + + return err; +} +#else +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + return -EOPNOTSUPP; +} +#endif + +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags, *subbuf_ids; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (cpu_buffer->user_mapped) { + err = __rb_map_vma(cpu_buffer, vma); + if (!err) + err = __rb_inc_dec_mapped(cpu_buffer, true); + mutex_unlock(&cpu_buffer->mapping_lock); + return err; + } + + /* prevent another thread from changing buffer/sub-buffer sizes */ + mutex_lock(&buffer->mutex); + + err = rb_alloc_meta_page(cpu_buffer); + if (err) + goto unlock; + + /* subbuf_ids include the reader while nr_pages does not */ + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); + if (!subbuf_ids) { + rb_free_meta_page(cpu_buffer); + err = -ENOMEM; + goto unlock; + } + + atomic_inc(&cpu_buffer->resize_disabled); + + /* + * Lock all readers to block any subbuf swap until the subbuf IDs are + * assigned. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + err = __rb_map_vma(cpu_buffer, vma); + if (!err) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* This is the first time it is mapped by user */ + cpu_buffer->mapped++; + cpu_buffer->user_mapped = 1; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } else { + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + } + +unlock: + mutex_unlock(&buffer->mutex); + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->user_mapped) { + err = -ENODEV; + goto out; + } else if (cpu_buffer->user_mapped > 1) { + __rb_inc_dec_mapped(cpu_buffer, false); + goto out; + } + + mutex_lock(&buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + /* This is the last user space mapping */ + if (!WARN_ON_ONCE(cpu_buffer->mapped < cpu_buffer->user_mapped)) + cpu_buffer->mapped--; + cpu_buffer->user_mapped = 0; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); + +out: + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + unsigned long missed_events; + unsigned long reader_size; + unsigned long flags; + + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); + if (IS_ERR(cpu_buffer)) + return (int)PTR_ERR(cpu_buffer); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + +consume: + if (rb_per_cpu_empty(cpu_buffer)) + goto out; + + reader_size = rb_page_size(cpu_buffer->reader_page); + + /* + * There are data to be read on the current reader page, we can + * return to the caller. But before that, we assume the latter will read + * everything. Let's update the kernel reader accordingly. + */ + if (cpu_buffer->reader_page->read < reader_size) { + while (cpu_buffer->reader_page->read < reader_size) + rb_advance_reader(cpu_buffer); + goto out; + } + + reader = rb_get_reader_page(cpu_buffer); + if (WARN_ON(!reader)) + goto out; + + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { + if (missed_events) { + struct buffer_data_page *bpage = reader->page; + unsigned int commit; + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + /* + * If there is room at the end of the page to save the + * missed events, then record it there. + */ + commit = rb_page_size(reader); + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + } else { + /* + * There really shouldn't be any missed events if the commit + * is on the reader page. + */ + WARN_ON_ONCE(missed_events); + } + + cpu_buffer->lost_events = 0; + + goto consume; + +out: + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + + rb_update_meta_page(cpu_buffer); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + rb_put_mapped_buffer(cpu_buffer); + + return 0; +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 008187ebd7fe..cdc3aea12c93 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -307,14 +307,14 @@ static void ring_buffer_producer(void) if (!disable_reader) { if (consumer_fifo) trace_printk("Running Consumer at SCHED_FIFO %s\n", - consumer_fifo == 1 ? "low" : "high"); + str_low_high(consumer_fifo == 1)); else trace_printk("Running Consumer at nice: %d\n", consumer_nice); } if (producer_fifo) trace_printk("Running Producer at SCHED_FIFO %s\n", - producer_fifo == 1 ? "low" : "high"); + str_low_high(producer_fifo == 1)); else trace_printk("Running Producer at nice: %d\n", producer_nice); diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 831779607e84..8226352a0062 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -25,30 +25,9 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst -config RV_MON_WIP - depends on RV - depends on PREEMPT_TRACER - select DA_MON_EVENTS_IMPLICIT - bool "wip monitor" - help - Enable wip (wakeup in preemptive) sample monitor that illustrates - the usage of per-cpu monitors, and one limitation of the - preempt_disable/enable events. - - For further information, see: - Documentation/trace/rv/monitor_wip.rst - -config RV_MON_WWNR - depends on RV - select DA_MON_EVENTS_ID - bool "wwnr monitor" - help - Enable wwnr (wakeup while not running) sample monitor, this is a - sample monitor that illustrates the usage of per-task monitor. - The model is borken on purpose: it serves to test reactors. - - For further information, see: - Documentation/trace/rv/monitor_wwnr.rst +source "kernel/trace/rv/monitors/wip/Kconfig" +source "kernel/trace/rv/monitors/wwnr/Kconfig" +# Add new monitors here config RV_REACTORS bool "Runtime verification reactors" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 963d14875b45..188b64668e1f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,8 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I $(src) # needed for trace events + obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +# Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig new file mode 100644 index 000000000000..3ef664b5cd90 --- /dev/null +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -0,0 +1,12 @@ +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index b2b49a27e886..db7389157c87 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -10,7 +10,7 @@ #define MODULE_NAME "wip" -#include <trace/events/rv.h> +#include <rv_trace.h> #include <trace/events/sched.h> #include <trace/events/preemptirq.h> diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h new file mode 100644 index 000000000000..aa2162f47a4c --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WIP +DEFINE_EVENT(event_da_monitor, event_wip, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_wip, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_WIP */ diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig new file mode 100644 index 000000000000..ee741aa6d6b8 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. + + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 0e43dd2db685..3b16994a9984 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -10,7 +10,7 @@ #define MODULE_NAME "wwnr" -#include <trace/events/rv.h> +#include <rv_trace.h> #include <trace/events/sched.h> #include "wwnr.h" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h new file mode 100644 index 000000000000..fc97ec7476ad --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WWNR +/* id is the pid of the task */ +DEFINE_EVENT(event_da_monitor_id, event_wwnr, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_wwnr, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_WWNR */ diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 2f68e93fff0b..8657fc8806e7 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -41,7 +41,7 @@ * per-task monitor, and so on), and the helper functions that glue the * monitor to the system via trace. Generally, a monitor includes some form * of trace output as a reaction for event parsing and exceptions, - * as depicted bellow: + * as depicted below: * * Linux +----- RV Monitor ----------------------------------+ Formal * Realm | | Realm @@ -145,7 +145,7 @@ #ifdef CONFIG_DA_MON_EVENTS #define CREATE_TRACE_POINTS -#include <trace/events/rv.h> +#include <rv_trace.h> #endif #include "rv.h" @@ -245,6 +245,7 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) /** * rv_disable_monitor - disable a given runtime monitor + * @mdef: Pointer to the monitor definition structure. * * Returns 0 on success. */ @@ -256,6 +257,7 @@ int rv_disable_monitor(struct rv_monitor_def *mdef) /** * rv_enable_monitor - enable a given runtime monitor + * @mdef: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. */ @@ -304,7 +306,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u static const struct file_operations interface_enable_fops = { .open = simple_open, - .llseek = no_llseek, .write = monitor_enable_write_data, .read = monitor_enable_read_data, }; @@ -327,7 +328,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, static const struct file_operations interface_desc_fops = { .open = simple_open, - .llseek = no_llseek, .read = monitor_desc_read_data, }; @@ -672,7 +672,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us static const struct file_operations monitoring_on_fops = { .open = simple_open, - .llseek = no_llseek, .write = monitoring_on_write_data, .read = monitoring_on_read_data, }; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 6aae106695b6..7b49cbe388d4 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -426,7 +426,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user static const struct file_operations reacting_on_fops = { .open = simple_open, - .llseek = no_llseek, .write = reacting_on_write_data, .read = reacting_on_read_data, }; diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h new file mode 100644 index 000000000000..5e65097423ba --- /dev/null +++ b/kernel/trace/rv/rv_trace.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rv + +#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RV_H + +#include <linux/rv.h> +#include <linux/tracepoint.h> + +#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT +DECLARE_EVENT_CLASS(event_da_monitor, + + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(state, event, next_state, final_state), + + TP_STRUCT__entry( + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + __array( char, next_state, MAX_DA_NAME_LEN ) + __field( bool, final_state ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); + __entry->final_state = final_state; + ), + + TP_printk("%s x %s -> %s %s", + __entry->state, + __entry->event, + __entry->next_state, + __entry->final_state ? "(final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor, + + TP_PROTO(char *state, char *event), + + TP_ARGS(state, event), + + TP_STRUCT__entry( + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + ), + + TP_printk("event %s not expected in the state %s", + __entry->event, + __entry->state) +); + +#include <monitors/wip/wip_trace.h> +// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here + +#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ + +#ifdef CONFIG_DA_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(id, state, event, next_state, final_state), + + TP_STRUCT__entry( + __field( int, id ) + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + __array( char, next_state, MAX_DA_NAME_LEN ) + __field( bool, final_state ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); + __entry->id = id; + __entry->final_state = final_state; + ), + + TP_printk("%d: %s x %s -> %s %s", + __entry->id, + __entry->state, + __entry->event, + __entry->next_state, + __entry->final_state ? "(final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor_id, + + TP_PROTO(int id, char *state, char *event), + + TP_ARGS(id, state, event), + + TP_STRUCT__entry( + __field( int, id ) + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s", + __entry->id, + __entry->event, + __entry->state) +); + +#include <monitors/wwnr/wwnr_trace.h> +// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here + +#endif /* CONFIG_DA_MON_EVENTS_ID */ +#endif /* _TRACE_RV_H */ + +/* This part ust be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE rv_trace +#include <trace/define_trace.h> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c9c898307348..0e6d517e74e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -13,7 +13,7 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> -#include <generated/utsrelease.h> +#include <linux/utsname.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> @@ -26,6 +26,7 @@ #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> @@ -39,7 +40,6 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/panic_notifier.h> -#include <linux/kmemleak.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> @@ -105,7 +105,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_taskinfo_save); +DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -131,9 +131,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops + * Set instance name if you want to dump the specific trace instance + * Multiple instance dump is also supported, and instances are seperated + * by commas. */ - -enum ftrace_dump_mode ftrace_dump_on_oops; +/* Set to string format zero to disable by default */ +char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; @@ -179,7 +182,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx); -#define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; @@ -202,19 +204,33 @@ static int __init set_cmdline_ftrace(char *str) } __setup("ftrace=", set_cmdline_ftrace); +int ftrace_dump_on_oops_enabled(void) +{ + if (!strcmp("0", ftrace_dump_on_oops)) + return 0; + else + return 1; +} + static int __init set_ftrace_dump_on_oops(char *str) { - if (*str++ != '=' || !*str || !strcmp("1", str)) { - ftrace_dump_on_oops = DUMP_ALL; + if (!*str) { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); return 1; } - if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { - ftrace_dump_on_oops = DUMP_ORIG; - return 1; - } + if (*str == ',') { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); + strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1); + return 1; + } + + if (*str++ == '=') { + strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE); + return 1; + } - return 0; + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -467,7 +483,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR) + TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ @@ -475,7 +491,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) /* * The global_trace is the descriptor that holds the top-level tracing @@ -485,6 +501,29 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; +static struct trace_array *printk_trace = &global_trace; + +static __always_inline bool printk_binsafe(struct trace_array *tr) +{ + /* + * The binary format of traceprintk can cause a crash if used + * by a buffer from another boot. Force the use of the + * non binary version of trace_printk if the trace_printk + * buffer is a boot mapped ring buffer. + */ + return !(tr->flags & TRACE_ARRAY_FL_BOOT); +} + +static void update_printk_trace(struct trace_array *tr) +{ + if (printk_trace == tr) + return; + + printk_trace->trace_flags &= ~TRACE_ITER_TRACE_PRINTK; + printk_trace = tr; + tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; +} + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -497,19 +536,16 @@ LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; - int ret = -ENODEV; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; - ret = 0; - break; + return 0; } } - mutex_unlock(&trace_types_lock); - return ret; + return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) @@ -555,19 +591,6 @@ int tracing_check_open_get_tr(struct trace_array *tr) return 0; } -int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - __trace_event_discard_commit(buffer, event); - return 1; - } - - return 0; -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -950,7 +973,8 @@ static inline void trace_access_lock_init(void) #endif #ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs); static inline void ftrace_trace_stack(struct trace_array *tr, @@ -959,7 +983,8 @@ static inline void ftrace_trace_stack(struct trace_array *tr, int skip, struct pt_regs *regs); #else -static inline void __ftrace_trace_stack(struct trace_buffer *buffer, +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { @@ -1102,7 +1127,7 @@ EXPORT_SYMBOL_GPL(__trace_array_puts); */ int __trace_puts(unsigned long ip, const char *str, int size) { - return __trace_array_puts(&global_trace, ip, str, size); + return __trace_array_puts(printk_trace, ip, str, size); } EXPORT_SYMBOL_GPL(__trace_puts); @@ -1113,6 +1138,7 @@ EXPORT_SYMBOL_GPL(__trace_puts); */ int __trace_bputs(unsigned long ip, const char *str) { + struct trace_array *tr = READ_ONCE(printk_trace); struct ring_buffer_event *event; struct trace_buffer *buffer; struct bputs_entry *entry; @@ -1120,14 +1146,17 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int ret = 0; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!printk_binsafe(tr)) + return __trace_puts(ip, str, strlen(str)); + + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) return 0; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; trace_ctx = tracing_gen_ctx(); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; ring_buffer_nest_start(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, @@ -1140,7 +1169,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); ret = 1; out: @@ -1176,6 +1205,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, return; } + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + local_irq_save(flags); update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); @@ -1301,6 +1336,50 @@ static void free_snapshot(struct trace_array *tr) tr->allocated_snapshot = false; } +static int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX || tr->mapped) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_arm_snapshot_locked(tr); + mutex_unlock(&trace_types_lock); + + return ret; +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + /** * tracing_alloc_snapshot - allocate snapshot buffer. * @@ -1362,26 +1441,20 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { - struct cond_snapshot *cond_snapshot; - int ret = 0; + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + int ret; - cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - ret = tracing_alloc_snapshot_instance(tr); - if (ret) - goto fail_unlock; - - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto fail_unlock; - } + if (tr->current_trace->use_max_tr) + return -EBUSY; /* * The cond_snapshot can only change to NULL without the @@ -1391,25 +1464,20 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, * do safely with only holding the trace_types_lock and not * having to take the max_lock. */ - if (tr->cond_snapshot) { - ret = -EBUSY; - goto fail_unlock; - } + if (tr->cond_snapshot) + return -EBUSY; + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; local_irq_disable(); arch_spin_lock(&tr->max_lock); - tr->cond_snapshot = cond_snapshot; + tr->cond_snapshot = no_free_ptr(cond_snapshot); arch_spin_unlock(&tr->max_lock); local_irq_enable(); - mutex_unlock(&trace_types_lock); - - return ret; - - fail_unlock: - mutex_unlock(&trace_types_lock); - kfree(cond_snapshot); - return ret; + return 0; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); @@ -1440,6 +1508,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) arch_spin_unlock(&tr->max_lock); local_irq_enable(); + tracing_disarm_snapshot(tr); + return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); @@ -1482,6 +1552,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) +#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1839,7 +1910,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strscpy(max_data->comm, tsk->comm); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -1955,15 +2026,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) #endif /* CONFIG_TRACER_MAX_TRACE */ +struct pipe_wait { + struct trace_iterator *iter; + int wait_index; +}; + +static bool wait_pipe_cond(void *data) +{ + struct pipe_wait *pwait = data; + struct trace_iterator *iter = pwait->iter; + + if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index) + return true; + + return iter->closed; +} + static int wait_on_pipe(struct trace_iterator *iter, int full) { + struct pipe_wait pwait; int ret; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + pwait.wait_index = atomic_read_acquire(&iter->wait_index); + pwait.iter = iter; + + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, + wait_pipe_cond, &pwait); #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -2098,10 +2190,10 @@ static __init int init_trace_selftests(void) selftests_can_run = true; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (list_empty(&postponed_selftests)) - goto out; + return 0; pr_info("Running postponed tracer tests:\n"); @@ -2130,17 +2222,10 @@ static __init int init_trace_selftests(void) } tracing_selftest_running = false; - out: - mutex_unlock(&trace_types_lock); - return 0; } core_initcall(init_trace_selftests); #else -static inline int run_tracer_selftest(struct tracer *type) -{ - return 0; -} static inline int do_run_tracer_selftest(struct tracer *type) { return 0; @@ -2274,6 +2359,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf) ring_buffer_record_enable(buffer); } +static void tracing_reset_all_cpus(struct array_buffer *buf) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset(buffer); + + ring_buffer_record_enable(buffer); +} + /* Must have trace_types_lock held */ void tracing_reset_all_online_cpus_unlocked(void) { @@ -2299,98 +2403,6 @@ void tracing_reset_all_online_cpus(void) mutex_unlock(&trace_types_lock); } -/* - * The tgid_map array maps from pid to tgid; i.e. the value stored at index i - * is the tgid last observed corresponding to pid=i. - */ -static int *tgid_map; - -/* The maximum valid index into tgid_map. */ -static size_t tgid_map_max; - -#define SAVED_CMDLINES_DEFAULT 128 -#define NO_CMDLINE_MAP UINT_MAX -/* - * Preemption must be disabled before acquiring trace_cmdline_lock. - * The various trace_arrays' max_lock must be acquired in a context - * where interrupt is disabled. - */ -static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; -struct saved_cmdlines_buffer { - unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; - unsigned *map_cmdline_to_pid; - unsigned cmdline_num; - int cmdline_idx; - char saved_cmdlines[]; -}; -static struct saved_cmdlines_buffer *savedcmd; - -static inline char *get_saved_cmdlines(int idx) -{ - return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; -} - -static inline void set_cmdline(int idx, const char *cmdline) -{ - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); -} - -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); - - kfree(s->map_cmdline_to_pid); - kmemleak_free(s); - free_pages((unsigned long)s, order); -} - -static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) -{ - struct saved_cmdlines_buffer *s; - struct page *page; - int orig_size, size; - int order; - - /* Figure out how much is needed to hold the given number of cmdlines */ - orig_size = sizeof(*s) + val * TASK_COMM_LEN; - order = get_order(orig_size); - size = 1 << (order + PAGE_SHIFT); - page = alloc_pages(GFP_KERNEL, order); - if (!page) - return NULL; - - s = page_address(page); - kmemleak_alloc(s, size, 1, GFP_KERNEL); - memset(s, 0, sizeof(*s)); - - /* Round up to actual allocation */ - val = (size - sizeof(*s)) / TASK_COMM_LEN; - s->cmdline_num = val; - - s->map_cmdline_to_pid = kmalloc_array(val, - sizeof(*s->map_cmdline_to_pid), - GFP_KERNEL); - if (!s->map_cmdline_to_pid) { - free_saved_cmdlines_buffer(s); - return NULL; - } - - s->cmdline_idx = 0; - memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, - sizeof(s->map_pid_to_cmdline)); - memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, - val * sizeof(*s->map_cmdline_to_pid)); - - return s; -} - -static int trace_create_savedcmd(void) -{ - savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - - return savedcmd ? 0 : -ENOMEM; -} - int is_tracing_stopped(void) { return global_trace.stop_count; @@ -2483,201 +2495,6 @@ void tracing_stop(void) return tracing_stop_tr(&global_trace); } -static int trace_save_cmdline(struct task_struct *tsk) -{ - unsigned tpid, idx; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - tpid = tsk->pid & (PID_MAX_DEFAULT - 1); - - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - * - * This is called within the scheduler and wake up, so interrupts - * had better been disabled and run queue lock been held. - */ - lockdep_assert_preemption_disabled(); - if (!arch_spin_trylock(&trace_cmdline_lock)) - return 0; - - idx = savedcmd->map_pid_to_cmdline[tpid]; - if (idx == NO_CMDLINE_MAP) { - idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - - savedcmd->map_pid_to_cmdline[tpid] = idx; - savedcmd->cmdline_idx = idx; - } - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - set_cmdline(idx, tsk->comm); - - arch_spin_unlock(&trace_cmdline_lock); - - return 1; -} - -static void __trace_find_cmdline(int pid, char comm[]) -{ - unsigned map; - int tpid; - - if (!pid) { - strcpy(comm, "<idle>"); - return; - } - - if (WARN_ON_ONCE(pid < 0)) { - strcpy(comm, "<XXX>"); - return; - } - - tpid = pid & (PID_MAX_DEFAULT - 1); - map = savedcmd->map_pid_to_cmdline[tpid]; - if (map != NO_CMDLINE_MAP) { - tpid = savedcmd->map_cmdline_to_pid[map]; - if (tpid == pid) { - strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - return; - } - } - strcpy(comm, "<...>"); -} - -void trace_find_cmdline(int pid, char comm[]) -{ - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - __trace_find_cmdline(pid, comm); - - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int *trace_find_tgid_ptr(int pid) -{ - /* - * Pairs with the smp_store_release in set_tracer_flag() to ensure that - * if we observe a non-NULL tgid_map then we also observe the correct - * tgid_map_max. - */ - int *map = smp_load_acquire(&tgid_map); - - if (unlikely(!map || pid > tgid_map_max)) - return NULL; - - return &map[pid]; -} - -int trace_find_tgid(int pid) -{ - int *ptr = trace_find_tgid_ptr(pid); - - return ptr ? *ptr : 0; -} - -static int trace_save_tgid(struct task_struct *tsk) -{ - int *ptr; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - ptr = trace_find_tgid_ptr(tsk->pid); - if (!ptr) - return 0; - - *ptr = tsk->tgid; - return 1; -} - -static bool tracing_record_taskinfo_skip(int flags) -{ - if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) - return true; - if (!__this_cpu_read(trace_taskinfo_save)) - return true; - return false; -} - -/** - * tracing_record_taskinfo - record the task info of a task - * - * @task: task to record - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo(struct task_struct *task, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/** - * tracing_record_taskinfo_sched_switch - record task info for sched_switch - * - * @prev: previous task during sched_switch - * @next: next task during sched_switch - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo_sched_switch(struct task_struct *prev, - struct task_struct *next, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); - done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/* Helpers to record a specific task information */ -void tracing_record_cmdline(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); -} - -void tracing_record_tgid(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_TGID); -} - /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function @@ -2719,6 +2536,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; + if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; } @@ -2965,14 +2784,14 @@ static void output_printk(struct trace_event_buffer *fbuffer) raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); } -int tracepoint_printk_sysctl(struct ctl_table *table, int write, +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; int ret; - mutex_lock(&tracepoint_printk_mutex); + guard(mutex)(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); @@ -2985,16 +2804,13 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write, tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) - goto out; + return ret; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); - out: - mutex_unlock(&tracepoint_printk_mutex); - return ret; } @@ -3064,7 +2880,6 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned int trace_ctx) { - struct trace_event_call *call = &event_function; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -3077,11 +2892,9 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&trace_function_exports_enabled)) - ftrace_exports(event, TRACE_EXPORT_FUNCTION); - __buffer_unlock_commit(buffer, event); - } + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); + __buffer_unlock_commit(buffer, event); } #ifdef CONFIG_STACKTRACE @@ -3089,7 +2902,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long /* Allow 4 levels of nesting: normal, softirq, irq, NMI */ #define FTRACE_KSTACK_NESTING 4 -#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) +#define FTRACE_KSTACK_ENTRIES (SZ_4K / FTRACE_KSTACK_NESTING) struct ftrace_stack { unsigned long calls[FTRACE_KSTACK_ENTRIES]; @@ -3103,11 +2916,11 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct trace_buffer *buffer, +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx, int skip, struct pt_regs *regs) { - struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; unsigned int size, nr_entries; struct ftrace_stack *fstack; @@ -3150,6 +2963,20 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } +#ifdef CONFIG_DYNAMIC_FTRACE + /* Mark entry of stack trace as trampoline code */ + if (tr->ops && tr->ops->trampoline) { + unsigned long tramp_start = tr->ops->trampoline; + unsigned long tramp_end = tramp_start + tr->ops->trampoline_size; + unsigned long *calls = fstack->calls; + + for (int i = 0; i < nr_entries; i++) { + if (calls[i] >= tramp_start && calls[i] < tramp_end) + calls[i] = FTRACE_TRAMPOLINE_MARKER; + } + } +#endif + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, struct_size(entry, caller, nr_entries), trace_ctx); @@ -3161,8 +2988,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, memcpy(&entry->caller, fstack->calls, flex_array_size(entry, caller, nr_entries)); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out: /* Again, don't let gcc optimize things here */ @@ -3180,7 +3006,7 @@ static inline void ftrace_trace_stack(struct trace_array *tr, if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(buffer, trace_ctx, skip, regs); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); } void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, @@ -3189,7 +3015,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, struct trace_buffer *buffer = tr->array_buffer.buffer; if (rcu_is_watching()) { - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); return; } @@ -3206,7 +3032,7 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, return; ct_irq_enter_irqson(); - __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, NULL); ct_irq_exit_irqson(); } @@ -3223,8 +3049,8 @@ void trace_dump_stack(int skip) /* Skip 1 to skip this function. */ skip++; #endif - __ftrace_trace_stack(global_trace.array_buffer.buffer, - tracing_gen_ctx(), skip, NULL); + __ftrace_trace_stack(printk_trace, printk_trace->array_buffer.buffer, + tracing_gen_ctx(), skip, NULL); } EXPORT_SYMBOL_GPL(trace_dump_stack); @@ -3235,7 +3061,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx) { - struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; @@ -3269,8 +3094,7 @@ ftrace_trace_userstack(struct trace_array *tr, memset(&entry->caller, 0, sizeof(entry->caller)); stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); - if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + __buffer_unlock_commit(buffer, event); out_drop_count: __this_cpu_dec(user_stack_count); @@ -3439,15 +3263,17 @@ static void trace_printk_start_stop_comm(int enabled) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_buffer *buffer; - struct trace_array *tr = &global_trace; + struct trace_array *tr = READ_ONCE(printk_trace); struct bprint_entry *entry; unsigned int trace_ctx; char *tbuffer; int len = 0, size; + if (!printk_binsafe(tr)) + return trace_vprintk(ip, fmt, args); + if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -3480,10 +3306,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); @@ -3503,7 +3327,6 @@ static int __trace_array_vprintk(struct trace_buffer *buffer, unsigned long ip, const char *fmt, va_list args) { - struct trace_event_call *call = &event_print; struct ring_buffer_event *event; int len = 0, size; struct print_entry *entry; @@ -3538,10 +3361,8 @@ __trace_array_vprintk(struct trace_buffer *buffer, entry->ip = ip; memcpy(&entry->buf, tbuffer, len + 1); - if (!call_filter_check_discard(call, entry, buffer, event)) { - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); - } + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); out: ring_buffer_nest_end(buffer); @@ -3636,7 +3457,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, int ret; va_list ap; - if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + if (!(printk_trace->trace_flags & TRACE_ITER_PRINTK)) return 0; va_start(ap, fmt); @@ -3648,7 +3469,7 @@ int trace_array_printk_buf(struct trace_buffer *buffer, __printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { - return trace_array_vprintk(&global_trace, ip, fmt, args); + return trace_array_vprintk(printk_trace, ip, fmt, args); } EXPORT_SYMBOL_GPL(trace_vprintk); @@ -3771,17 +3592,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str, - bool star, int len) +static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; - /* Ignore strings with no length */ - if (star && !len) - return true; - /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3821,133 +3637,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static DEFINE_STATIC_KEY_FALSE(trace_no_verify); - -static int test_can_verify_check(const char *fmt, ...) -{ - char buf[16]; - va_list ap; - int ret; - - /* - * The verifier is dependent on vsnprintf() modifies the va_list - * passed to it, where it is sent as a reference. Some architectures - * (like x86_32) passes it by value, which means that vsnprintf() - * does not modify the va_list passed to it, and the verifier - * would then need to be able to understand all the values that - * vsnprintf can use. If it is passed by value, then the verifier - * is disabled. - */ - va_start(ap, fmt); - vsnprintf(buf, 16, "%d", ap); - ret = va_arg(ap, int); - va_end(ap); - - return ret; -} - -static void test_can_verify(void) -{ - if (!test_can_verify_check("%d %d", 0, 1)) { - pr_info("trace event string verifier disabled\n"); - static_branch_inc(&trace_no_verify); - } -} - /** - * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed - * @fmt: The format used to print the event - * @ap: The va_list holding the data to print from @fmt. * - * This writes the data into the @iter->seq buffer using the data from - * @fmt and @ap. If the format has a %s, then the source of the string - * is examined to make sure it is safe to print, otherwise it will - * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string - * pointer. + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. */ -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) +bool ignore_event(struct trace_iterator *iter) { - const char *p = fmt; - const char *str; - int i, j; + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; - if (WARN_ON_ONCE(!fmt)) - return; + trace_event = ftrace_find_event(iter->ent->type); - if (static_branch_unlikely(&trace_no_verify)) - goto print; + seq = &iter->seq; - /* Don't bother checking when doing a ftrace_dump() */ - if (iter->fmt == static_fmt_buf) - goto print; + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; + } - while (*p) { - bool star = false; - int len = 0; - - j = 0; - - /* We only care about %s and variants */ - for (i = 0; p[i]; i++) { - if (i + 1 >= iter->fmt_size) { - /* - * If we can't expand the copy buffer, - * just print it. - */ - if (!trace_iter_expand_format(iter)) - goto print; - } + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; - if (p[i] == '\\' && p[i+1]) { - i++; - continue; - } - if (p[i] == '%') { - /* Need to test cases like %08.*s */ - for (j = 1; p[i+j]; j++) { - if (isdigit(p[i+j]) || - p[i+j] == '.') - continue; - if (p[i+j] == '*') { - star = true; - continue; - } - break; - } - if (p[i+j] == 's') - break; - star = false; - } - j = 0; - } - /* If no %s found then just print normally */ - if (!p[i]) - break; + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } - /* Copy up to the %s, and print that */ - strncpy(iter->fmt, p, i); - iter->fmt[i] = '\0'; - trace_seq_vprintf(&iter->seq, iter->fmt, ap); + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; - /* - * If iter->seq is full, the above call no longer guarantees - * that ap is in sync with fmt processing, and further calls - * to va_arg() can return wrong positional arguments. - * - * Ensure that ap is no longer used in this case. - */ - if (iter->seq.full) { - p = ""; - break; - } + list_for_each_entry(field, head, link) { + const char *str; + bool good; - if (star) - len = va_arg(ap, int); + if (!field->needs_test) + continue; + + str = *(const char **)(ptr + field->offset); - /* The ap now points to the string data of the %s */ - str = va_arg(ap, const char *); + good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the @@ -3958,45 +3710,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!trace_safe_str(iter, str, star, len), - "fmt: '%s' current_buffer: '%s'", - fmt, seq_buf_str(&iter->seq.seq))) { - int ret; - - /* Try to safely read the string */ - if (star) { - if (len + 1 > iter->fmt_size) - len = iter->fmt_size - 1; - if (len < 0) - len = 0; - ret = copy_from_kernel_nofault(iter->fmt, str, len); - iter->fmt[len] = 0; - star = false; - } else { - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); - } - if (ret < 0) - trace_seq_printf(&iter->seq, "(0x%px)", str); - else - trace_seq_printf(&iter->seq, "(0x%px:%s)", - str, iter->fmt); - str = "[UNSAFE-MEMORY]"; - strcpy(iter->fmt, "%s"); - } else { - strncpy(iter->fmt, p + i, j + 1); - iter->fmt[j+1] = '\0'; + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; } - if (star) - trace_seq_printf(&iter->seq, iter->fmt, len, str); - else - trace_seq_printf(&iter->seq, iter->fmt, str); - - p += i + j + 1; } - print: - if (*p) - trace_seq_vprintf(&iter->seq, p, ap); + return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) @@ -4156,6 +3877,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) break; entries++; ring_buffer_iter_advance(buf_iter); + /* This could be a big loop */ + cond_resched(); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; @@ -4368,7 +4091,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); + name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" @@ -4380,6 +4103,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) preempt_model_none() ? "server" : preempt_model_voluntary() ? "desktop" : preempt_model_full() ? "preempt" : + preempt_model_lazy() ? "lazy" : preempt_model_rt() ? "preempt_rt" : "unknown", /* These are reserved for later use */ @@ -4464,6 +4188,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); + /* + * For TRACE_EVENT() events, the print_fmt is not + * safe to use if the array has delta offsets + * Force printing via the fields. + */ + if ((tr->text_delta || tr->data_delta) && + event->type > __TRACE_LAST_TYPE) + return print_event_fields(iter, event); + return event->funcs->trace(iter, sym_flags, event); } @@ -5119,6 +4852,11 @@ static int tracing_open(struct inode *inode, struct file *file) static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { +#ifdef CONFIG_TRACER_SNAPSHOT + /* arrays with mapped buffer range do not have snapshots */ + if (tr->range_addr_start && t->use_max_tr) + return false; +#endif return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } @@ -5211,7 +4949,7 @@ static int show_traces_open(struct inode *inode, struct file *file) return 0; } -static int show_traces_release(struct inode *inode, struct file *file) +static int tracing_seq_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -5252,7 +4990,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .llseek = seq_lseek, - .release = show_traces_release, + .release = tracing_seq_release, }; static ssize_t @@ -5331,6 +5069,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; + if (count == 0 || count > KMALLOC_MAX_SIZE) + return -EINVAL; + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -5367,7 +5108,8 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) u32 tracer_flags; int i; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + tracer_flags = tr->current_trace->flags->val; trace_opts = tr->current_trace->flags->opts; @@ -5384,7 +5126,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) else seq_printf(m, "no%s\n", trace_opts[i].name); } - mutex_unlock(&trace_types_lock); return 0; } @@ -5436,10 +5177,9 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { - int *map; - if ((mask == TRACE_ITER_RECORD_TGID) || - (mask == TRACE_ITER_RECORD_CMD)) + (mask == TRACE_ITER_RECORD_CMD) || + (mask == TRACE_ITER_TRACE_PRINTK)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ @@ -5451,6 +5191,25 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; + if (mask == TRACE_ITER_TRACE_PRINTK) { + if (enabled) { + update_printk_trace(tr); + } else { + /* + * The global_trace cannot clear this. + * It's flag only gets cleared if another instance sets it. + */ + if (printk_trace == &global_trace) + return -EINVAL; + /* + * An instance must always have it set. + * by default, that's the global_trace instane. + */ + if (printk_trace == tr) + update_printk_trace(&global_trace); + } + } + if (enabled) tr->trace_flags |= mask; else @@ -5460,20 +5219,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { - if (!tgid_map) { - tgid_map_max = pid_max; - map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), - GFP_KERNEL); - /* - * Pairs with smp_load_acquire() in - * trace_find_tgid_ptr() to ensure that if it observes - * the tgid_map we just allocated then it also observes - * the corresponding tgid_map_max value. - */ - smp_store_release(&tgid_map, map); - } - if (!tgid_map) { + if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } @@ -5613,6 +5360,10 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" + "By default tracefs removes all OTH file permission bits.\n" + "When mounting tracefs an optional group id can be specified\n" + "which adds the group to every directory and file in tracefs:\n\n" + "\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n" "# echo 0 > tracing_on : quick way to disable tracing\n" "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" " Important files:\n" @@ -5747,19 +5498,18 @@ static const char readme_msg[] = "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API -#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS "\t <argname>[->field[->field|.field...]],\n" -#else - "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #endif #else "\t $stack<index>, $stack, $retval, $comm,\n" #endif "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" + "\t kernel return probes support: $retval, $arg<N>, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" - "\t symstr, <type>\\[<array-size>\\]\n" + "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: <stype> <name>;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" @@ -5768,6 +5518,8 @@ static const char readme_msg[] = "\t efield: For event probes ('e' types), the field is on of the fields\n" "\t of the <attached-group>/<attached-event>.\n" #endif + " set_event\t\t- Enables events by name written into it\n" + "\t\t\t Can enable module events via: :mod:<module>\n" " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" @@ -5918,207 +5670,6 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) -{ - int pid = ++(*pos); - - return trace_find_tgid_ptr(pid); -} - -static void *saved_tgids_start(struct seq_file *m, loff_t *pos) -{ - int pid = *pos; - - return trace_find_tgid_ptr(pid); -} - -static void saved_tgids_stop(struct seq_file *m, void *v) -{ -} - -static int saved_tgids_show(struct seq_file *m, void *v) -{ - int *entry = (int *)v; - int pid = entry - tgid_map; - int tgid = *entry; - - if (tgid == 0) - return SEQ_SKIP; - - seq_printf(m, "%d %d\n", pid, tgid); - return 0; -} - -static const struct seq_operations tracing_saved_tgids_seq_ops = { - .start = saved_tgids_start, - .stop = saved_tgids_stop, - .next = saved_tgids_next, - .show = saved_tgids_show, -}; - -static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_tgids_seq_ops); -} - - -static const struct file_operations tracing_saved_tgids_fops = { - .open = tracing_saved_tgids_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) -{ - unsigned int *ptr = v; - - if (*pos || m->count) - ptr++; - - (*pos)++; - - for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; - ptr++) { - if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) - continue; - - return ptr; - } - - return NULL; -} - -static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) -{ - void *v; - loff_t l = 0; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - v = &savedcmd->map_cmdline_to_pid[0]; - while (l <= *pos) { - v = saved_cmdlines_next(m, v, &l); - if (!v) - return NULL; - } - - return v; -} - -static void saved_cmdlines_stop(struct seq_file *m, void *v) -{ - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int saved_cmdlines_show(struct seq_file *m, void *v) -{ - char buf[TASK_COMM_LEN]; - unsigned int *pid = v; - - __trace_find_cmdline(*pid, buf); - seq_printf(m, "%d %s\n", *pid, buf); - return 0; -} - -static const struct seq_operations tracing_saved_cmdlines_seq_ops = { - .start = saved_cmdlines_start, - .next = saved_cmdlines_next, - .stop = saved_cmdlines_stop, - .show = saved_cmdlines_show, -}; - -static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_cmdlines_seq_ops); -} - -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_saved_cmdlines_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static ssize_t -tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static int tracing_resize_saved_cmdlines(unsigned int val) -{ - struct saved_cmdlines_buffer *s, *savedcmd_temp; - - s = allocate_cmdlines_buffer(val); - if (!s) - return -ENOMEM; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - savedcmd_temp = savedcmd; - savedcmd = s; - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - free_saved_cmdlines_buffer(savedcmd_temp); - - return 0; -} - -static ssize_t -tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - /* must have at least 1 entry or less than PID_MAX_DEFAULT */ - if (!val || val > PID_MAX_DEFAULT) - return -EINVAL; - - ret = tracing_resize_saved_cmdlines((unsigned int)val); - if (ret < 0) - return ret; - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations tracing_saved_cmdlines_size_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_size_read, - .write = tracing_saved_cmdlines_size_write, -}; - #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) @@ -6241,7 +5792,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, return; } - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -6265,8 +5816,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, map_array++; } memset(map_array, 0, sizeof(*map_array)); - - mutex_unlock(&trace_eval_mutex); } static void trace_create_eval_file(struct dentry *d_tracer) @@ -6428,28 +5977,34 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ - if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { - ret = -EINVAL; - goto out; - } + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) + return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; + return __tracing_resize_ring_buffer(tr, size, cpu_id); +} -out: - mutex_unlock(&trace_types_lock); +static void update_last_data(struct trace_array *tr) +{ + if (!tr->text_delta && !tr->data_delta) + return; - return ret; -} + /* + * Need to clear all CPU buffers as there cannot be events + * from the previous boot mixed with events with this boot + * as that will cause a confusing trace. Need to clear all + * CPU buffers, even for those that may currently be offline. + */ + tracing_reset_all_cpus(&tr->array_buffer); + /* Using current data now */ + tr->text_delta = 0; + tr->data_delta = 0; +} /** * tracing_update_buffers - used by tracing facility to expand ring buffers @@ -6467,6 +6022,9 @@ int tracing_update_buffers(struct trace_array *tr) int ret = 0; mutex_lock(&trace_types_lock); + + update_last_data(tr); + if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); @@ -6488,7 +6046,7 @@ static void tracing_set_nop(struct trace_array *tr) { if (tr->current_trace == &nop_trace) return; - + tr->current_trace->enabled--; if (tr->current_trace->reset) @@ -6518,15 +6076,17 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif - int ret = 0; + int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + + update_last_data(tr); if (!tr->ring_buffer_expanded) { ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) - goto out; + return ret; ret = 0; } @@ -6534,43 +6094,37 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (strcmp(t->name, buf) == 0) break; } - if (!t) { - ret = -EINVAL; - goto out; - } + if (!t) + return -EINVAL; + if (t == tr->current_trace) - goto out; + return 0; #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; + ret = tr->cond_snapshot ? -EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; } #endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); - goto out; + return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) { - ret = -EINVAL; - goto out; - } + if (!trace_ok_for_array(t, tr)) + return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ - if (tr->trace_ref) { - ret = -EBUSY; - goto out; - } + if (tr->trace_ref) + return -EBUSY; trace_branch_disable(); @@ -6595,12 +6149,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) */ synchronize_rcu(); free_snapshot(tr); + tracing_disarm_snapshot(tr); } - if (t->use_max_tr && !tr->allocated_snapshot) { - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - goto out; + if (!had_max_tr && t->use_max_tr) { + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; } #else tr->current_trace = &nop_trace; @@ -6608,17 +6163,20 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->init) { ret = tracer_init(t, tr); - if (ret) - goto out; + if (ret) { +#ifdef CONFIG_TRACER_MAX_TRACE + if (t->use_max_tr) + tracing_disarm_snapshot(tr); +#endif + return ret; + } } tr->current_trace = t; tr->current_trace->enabled++; trace_branch_enable(tr); - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } static ssize_t @@ -6696,22 +6254,18 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) - goto out; + return ret; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) - goto out; + return ret; } - ret = cnt; -out: - mutex_unlock(&trace_types_lock); - - return ret; + return cnt; } #ifdef CONFIG_TRACER_MAX_TRACE @@ -6930,31 +6484,29 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, * This is just a matter of traces coherency, the ring buffer itself * is protected. */ - mutex_lock(&iter->mutex); + guard(mutex)(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) - goto out; + return sret; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) - goto out; + return sret; } waitagain: sret = tracing_wait_pipe(filp); if (sret <= 0) - goto out; + return sret; /* stop when tracing is finished */ - if (trace_empty(iter)) { - sret = 0; - goto out; - } + if (trace_empty(iter)) + return 0; if (cnt >= TRACE_SEQ_BUFFER_SIZE) cnt = TRACE_SEQ_BUFFER_SIZE - 1; @@ -7018,9 +6570,6 @@ waitagain: if (sret == -EBUSY) goto waitagain; -out: - mutex_unlock(&iter->mutex); - return sret; } @@ -7264,6 +6813,37 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, } static ssize_t +tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + struct seq_buf seq; + char buf[64]; + + seq_buf_init(&seq, buf, 64); + + seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); + seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); +} + +static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu); + if (ret < 0) + __trace_array_put(tr); + return ret; +} + +static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -7581,25 +7161,19 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve */ int tracing_set_filter_buffering(struct trace_array *tr, bool set) { - int ret = 0; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (set && tr->no_filter_buffering_ref++) - goto out; + return 0; if (!set) { - if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) + return -EINVAL; --tr->no_filter_buffering_ref; } - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } struct ftrace_buffer_info { @@ -7675,12 +7249,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (tr->current_trace->use_max_tr) + return -EBUSY; local_irq_disable(); arch_spin_lock(&tr->max_lock); @@ -7689,32 +7261,29 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; switch (val) { case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); - else - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - break; + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); @@ -7724,6 +7293,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } + tracing_disarm_snapshot(tr); break; default: if (tr->allocated_snapshot) { @@ -7739,8 +7309,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; ret = cnt; } -out: - mutex_unlock(&trace_types_lock); + return ret; } @@ -7826,7 +7395,6 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, - .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { @@ -7837,6 +7405,13 @@ static const struct file_operations tracing_entries_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_buffer_meta_fops = { + .open = tracing_buffer_meta_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, +}; + static const struct file_operations tracing_total_entries_fops = { .open = tracing_open_generic_tr, .read = tracing_total_entries_read, @@ -7877,6 +7452,13 @@ static const struct file_operations trace_time_stamp_mode_fops = { .release = tracing_single_release_tr, }; +static const struct file_operations last_boot_fops = { + .open = tracing_open_generic_tr, + .read = tracing_last_boot_read, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + #ifdef CONFIG_TRACER_SNAPSHOT static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, @@ -7891,7 +7473,6 @@ static const struct file_operations snapshot_raw_fops = { .read = tracing_buffers_read, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, - .llseek = no_llseek, }; #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -8114,12 +7695,11 @@ void tracing_log_err(struct trace_array *tr, len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + err = get_tracing_log_err(tr, len); - if (PTR_ERR(err) == -ENOMEM) { - mutex_unlock(&tracing_err_log_lock); + if (PTR_ERR(err) == -ENOMEM) return; - } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); @@ -8130,7 +7710,6 @@ void tracing_log_err(struct trace_array *tr, err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); - mutex_unlock(&tracing_err_log_lock); } static void clear_tracing_err_log(struct trace_array *tr) @@ -8362,7 +7941,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, trace_access_unlock(iter->cpu_file); if (ret < 0) { - if (trace_empty(iter)) { + if (trace_empty(iter) && !iter->closed) { if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; @@ -8398,9 +7977,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - iter->wait_index++; + iter->closed = true; /* Make sure the waiters see the new wait_index */ - smp_wmb(); + (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); @@ -8500,6 +8079,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; + bool woken = false; int page_size; int entries, i; ssize_t ret = 0; @@ -8573,17 +8153,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { - long wait_index; if (ret) goto out; + if (woken) + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - wait_index = READ_ONCE(iter->wait_index); - ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; @@ -8592,10 +8172,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!tracer_tracing_is_on(iter->tr)) goto out; - /* Make sure we see the new wait_index */ - smp_rmb(); - if (wait_index != iter->wait_index) - goto out; + /* Iterate one more time to collect any new data then exit */ + woken = true; goto again; } @@ -8607,20 +8185,36 @@ out: return ret; } -/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; + int err; + + if (cmd == TRACE_MMAP_IOCTL_GET_READER) { + if (!(file->f_flags & O_NONBLOCK)) { + err = ring_buffer_wait(iter->array_buffer->buffer, + iter->cpu_file, + iter->tr->buffer_percent, + NULL, NULL); + if (err) + return err; + } - if (cmd) - return -ENOIOCTLCMD; + return ring_buffer_map_get_reader(iter->array_buffer->buffer, + iter->cpu_file); + } else if (cmd) { + return -ENOTTY; + } + /* + * An ioctl call with cmd 0 to the ring buffer file will wake up all + * waiters + */ mutex_lock(&trace_types_lock); - iter->wait_index++; /* Make sure the waiters see the new wait_index */ - smp_wmb(); + (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); @@ -8628,6 +8222,80 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } +#ifdef CONFIG_TRACER_MAX_TRACE +static int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. + */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} +static void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} +#else +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +#endif + +static void tracing_buffers_mmap_close(struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + + WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file)); + put_snapshot_map(iter->tr); +} + +static const struct vm_operations_struct tracing_buffers_vmops = { + .close = tracing_buffers_mmap_close, +}; + +static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + int ret = 0; + + /* Currently the boot mapped buffer is not supported for mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + return -ENODEV; + + ret = get_snapshot_map(iter->tr); + if (ret) + return ret; + + ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma); + if (ret) + put_snapshot_map(iter->tr); + + vma->vm_ops = &tracing_buffers_vmops; + + return ret; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, @@ -8636,7 +8304,7 @@ static const struct file_operations tracing_buffers_fops = { .flush = tracing_buffers_flush, .splice_read = tracing_buffers_splice_read, .unlocked_ioctl = tracing_buffers_ioctl, - .llseek = no_llseek, + .mmap = tracing_buffers_mmap, }; static ssize_t @@ -8720,15 +8388,22 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, char *buf; int r; - /* 256 should be plenty to hold the amount needed */ - buf = kmalloc(256, GFP_KERNEL); + /* 512 should be plenty to hold the amount needed */ +#define DYN_INFO_BUF_SIZE 512 + + buf = kmalloc(DYN_INFO_BUF_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; - r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + r = scnprintf(buf, DYN_INFO_BUF_SIZE, + "%ld pages:%ld groups: %ld\n" + "ftrace boot update time = %llu (ns)\n" + "ftrace module total update time = %llu (ns)\n", ftrace_update_tot_cnt, ftrace_number_of_pages, - ftrace_number_of_groups); + ftrace_number_of_groups, + ftrace_update_time, + ftrace_total_mod_time); ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); @@ -8857,8 +8532,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') - return unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } if (!param) goto out_reg; @@ -8877,12 +8557,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = tracing_alloc_snapshot_instance(tr); + ret = tracing_arm_snapshot(tr); if (ret < 0) goto out; ret = register_ftrace_function_probe(glob, tr, ops, count); - + if (ret < 0) + tracing_disarm_snapshot(tr); out: return ret < 0 ? ret : 0; } @@ -8977,12 +8658,17 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); + if (tr->range_addr_start) + trace_create_cpu_file("buffer_meta", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_buffer_meta_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, - tr, cpu, &snapshot_fops); + if (!tr->range_addr_start) { + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, + tr, cpu, &snapshot_fops); - trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, - tr, cpu, &snapshot_raw_fops); + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, + tr, cpu, &snapshot_raw_fops); + } #endif } @@ -9519,7 +9205,21 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size buf->tr = tr; - buf->buffer = ring_buffer_alloc(size, rb_flags); + if (tr->range_addr_start && tr->range_addr_size) { + buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, + tr->range_addr_start, + tr->range_addr_size); + + ring_buffer_last_boot_delta(buf->buffer, + &tr->text_delta, &tr->data_delta); + /* + * This is basically the same as a mapped buffer, + * with the same restrictions. + */ + tr->mapped++; + } else { + buf->buffer = ring_buffer_alloc(size, rb_flags); + } if (!buf->buffer) return -ENOMEM; @@ -9556,6 +9256,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return ret; #ifdef CONFIG_TRACER_MAX_TRACE + /* Fix mapped buffer trace arrays do not have snapshot buffers */ + if (tr->range_addr_start) + return 0; + ret = allocate_trace_buffer(tr, &tr->max_buffer, allocate_snapshot ? size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { @@ -9656,7 +9360,9 @@ static int trace_array_create_dir(struct trace_array *tr) } static struct trace_array * -trace_array_create_systems(const char *name, const char *systems) +trace_array_create_systems(const char *name, const char *systems, + unsigned long range_addr_start, + unsigned long range_addr_size) { struct trace_array *tr; int ret; @@ -9682,6 +9388,10 @@ trace_array_create_systems(const char *name, const char *systems) goto out_free_tr; } + /* Only for boot up memory mapped ring buffers */ + tr->range_addr_start = range_addr_start; + tr->range_addr_size = range_addr_size; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9689,7 +9399,9 @@ trace_array_create_systems(const char *name, const char *systems) raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&tr->snapshot_trigger_lock); +#endif tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -9697,6 +9409,10 @@ trace_array_create_systems(const char *name, const char *systems) INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&tr->mod_events); +#endif + if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9737,7 +9453,7 @@ trace_array_create_systems(const char *name, const char *systems) static struct trace_array *trace_array_create(const char *name) { - return trace_array_create_systems(name, NULL); + return trace_array_create_systems(name, NULL, 0, 0); } static int instance_mkdir(const char *name) @@ -9745,23 +9461,45 @@ static int instance_mkdir(const char *name) struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) - goto out_unlock; + return -EEXIST; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); -out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return ret; } +static u64 map_pages(u64 start, u64 size) +{ + struct page **pages; + phys_addr_t page_start; + unsigned int page_count; + unsigned int i; + void *vaddr; + + page_count = DIV_ROUND_UP(size, PAGE_SIZE); + + page_start = start; + pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return 0; + + for (i = 0; i < page_count; i++) { + phys_addr_t addr = page_start + i * PAGE_SIZE; + pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + } + vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); + kfree(pages); + + return (u64)(unsigned long)vaddr; +} + /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. @@ -9783,24 +9521,23 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system { struct trace_array *tr; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; + if (tr->name && strcmp(tr->name, name) == 0) { + tr->ref++; + return tr; + } } - tr = trace_array_create_systems(name, systems); + tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; -out_unlock: - if (tr) + else tr->ref++; - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); @@ -9821,6 +9558,9 @@ static int __remove_instance(struct trace_array *tr) set_tracer_flag(tr, 1 << i, 0); } + if (printk_trace == tr) + update_printk_trace(&global_trace); + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -9848,48 +9588,36 @@ static int __remove_instance(struct trace_array *tr) int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; - int ret; if (!this_tr) return -EINVAL; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; /* Making sure trace array exists before destroying it. */ list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr == this_tr) { - ret = __remove_instance(tr); - break; - } + if (tr == this_tr) + return __remove_instance(tr); } - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - - return ret; + return -ENODEV; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; - int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; tr = trace_array_find(name); - if (tr) - ret = __remove_instance(tr); - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + if (!tr) + return -ENODEV; - return ret; + return __remove_instance(tr); } static __init void create_trace_instances(struct dentry *d_tracer) @@ -9902,19 +9630,16 @@ static __init void create_trace_instances(struct dentry *d_tracer) if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) - break; + return; } - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); } static void @@ -9983,10 +9708,15 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); + if (tr->range_addr_start) { + trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, + tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, - tr, &snapshot_fops); + } else { + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, + tr, &snapshot_fops); #endif + } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); @@ -10099,6 +9829,24 @@ late_initcall_sync(trace_eval_sync); #ifdef CONFIG_MODULES + +bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + static const char this_mod[] = "__this_module"; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + unsigned long val; + int n; + + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + + if (n > sizeof(modname) - 1) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) @@ -10123,7 +9871,7 @@ static void trace_module_remove_evals(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); map = trace_eval_maps; @@ -10135,12 +9883,10 @@ static void trace_module_remove_evals(struct module *mod) map = map->tail.next; } if (!map) - goto out; + return; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); - out: - mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_evals(struct module *mod) { } @@ -10254,14 +10000,14 @@ static struct notifier_block trace_die_notifier = { static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused) { - if (!ftrace_dump_on_oops) + if (!ftrace_dump_on_oops_enabled()) return NOTIFY_DONE; /* The die notifier requires DIE_OOPS to trigger */ if (self == &trace_die_notifier && ev != DIE_OOPS) return NOTIFY_DONE; - ftrace_dump(ftrace_dump_on_oops); + ftrace_dump(DUMP_PARAM); return NOTIFY_DONE; } @@ -10302,12 +10048,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void trace_init_global_iter(struct trace_iterator *iter) +static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { - iter->tr = &global_trace; + iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; - iter->array_buffer = &global_trace.array_buffer; + iter->array_buffer = &tr->array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -10327,22 +10073,19 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->fmt_size = STATIC_FMT_BUF_SIZE; } -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +void trace_init_global_iter(struct trace_iterator *iter) +{ + trace_init_iter(iter, &global_trace); +} + +static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static atomic_t dump_running; - struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; - /* Only allow one dump user at a time. */ - if (atomic_inc_return(&dump_running) != 1) { - atomic_dec(&dump_running); - return; - } - /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens @@ -10351,12 +10094,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. */ - tracing_off(); + tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ - trace_init_global_iter(&iter); + trace_init_iter(&iter, tr); for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -10367,21 +10110,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - switch (oops_dump_mode) { - case DUMP_ALL: - iter.cpu_file = RING_BUFFER_ALL_CPUS; - break; - case DUMP_ORIG: + if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); - break; - case DUMP_NONE: - goto out_enable; - default: - printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + else iter.cpu_file = RING_BUFFER_ALL_CPUS; - } - printk(KERN_TRACE "Dumping ftrace buffer:\n"); + if (tr == &global_trace) + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + else + printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? */ if (ftrace_is_dead()) { @@ -10423,15 +10160,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) else printk(KERN_TRACE "---------------------------------\n"); - out_enable: tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); local_irq_restore(flags); } + +static void ftrace_dump_by_param(void) +{ + bool first_param = true; + char dump_param[MAX_TRACER_SIZE]; + char *buf, *token, *inst_name; + struct trace_array *tr; + + strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); + buf = dump_param; + + while ((token = strsep(&buf, ",")) != NULL) { + if (first_param) { + first_param = false; + if (!strcmp("0", token)) + continue; + else if (!strcmp("1", token)) { + ftrace_dump_one(&global_trace, DUMP_ALL); + continue; + } + else if (!strcmp("2", token) || + !strcmp("orig_cpu", token)) { + ftrace_dump_one(&global_trace, DUMP_ORIG); + continue; + } + } + + inst_name = strsep(&token, "="); + tr = trace_array_find(inst_name); + if (!tr) { + printk(KERN_TRACE "Instance %s not found\n", inst_name); + continue; + } + + if (token && (!strcmp("2", token) || + !strcmp("orig_cpu", token))) + ftrace_dump_one(tr, DUMP_ORIG); + else + ftrace_dump_one(tr, DUMP_ALL); + } +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + static atomic_t dump_running; + + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + switch (oops_dump_mode) { + case DUMP_ALL: + ftrace_dump_one(&global_trace, DUMP_ALL); + break; + case DUMP_ORIG: + ftrace_dump_one(&global_trace, DUMP_ORIG); + break; + case DUMP_PARAM: + ftrace_dump_by_param(); + break; + case DUMP_NONE: + break; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + ftrace_dump_one(&global_trace, DUMP_ALL); + } + + atomic_dec(&dump_running); +} EXPORT_SYMBOL_GPL(ftrace_dump); #define WRITE_BUFSIZE 4096 @@ -10546,6 +10352,7 @@ __init static void enable_instances(void) { struct trace_array *tr; char *curr_str; + char *name; char *str; char *tok; @@ -10554,19 +10361,107 @@ __init static void enable_instances(void) str = boot_instance_info; while ((curr_str = strsep(&str, "\t"))) { + phys_addr_t start = 0; + phys_addr_t size = 0; + unsigned long addr = 0; + bool traceprintk = false; + bool traceoff = false; + char *flag_delim; + char *addr_delim; tok = strsep(&curr_str, ","); - if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) - do_allocate_snapshot(tok); + flag_delim = strchr(tok, '^'); + addr_delim = strchr(tok, '@'); - tr = trace_array_get_by_name(tok, NULL); - if (!tr) { - pr_warn("Failed to create instance buffer %s\n", curr_str); + if (addr_delim) + *addr_delim++ = '\0'; + + if (flag_delim) + *flag_delim++ = '\0'; + + name = tok; + + if (flag_delim) { + char *flag; + + while ((flag = strsep(&flag_delim, "^"))) { + if (strcmp(flag, "traceoff") == 0) { + traceoff = true; + } else if ((strcmp(flag, "printk") == 0) || + (strcmp(flag, "traceprintk") == 0) || + (strcmp(flag, "trace_printk") == 0)) { + traceprintk = true; + } else { + pr_info("Tracing: Invalid instance flag '%s' for %s\n", + flag, name); + } + } + } + + tok = addr_delim; + if (tok && isdigit(*tok)) { + start = memparse(tok, &tok); + if (!start) { + pr_warn("Tracing: Invalid boot instance address for %s\n", + name); + continue; + } + if (*tok != ':') { + pr_warn("Tracing: No size specified for instance %s\n", name); + continue; + } + tok++; + size = memparse(tok, &tok); + if (!size) { + pr_warn("Tracing: Invalid boot instance size for %s\n", + name); + continue; + } + } else if (tok) { + if (!reserve_mem_find_by_name(tok, &start, &size)) { + start = 0; + pr_warn("Failed to map boot instance %s to %s\n", name, tok); + continue; + } + } + + if (start) { + addr = map_pages(start, size); + if (addr) { + pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", + name, &start, (unsigned long)size); + } else { + pr_warn("Tracing: Failed to map boot instance %s\n", name); + continue; + } + } else { + /* Only non mapped buffers have snapshot buffers */ + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(name); + } + + tr = trace_array_create_systems(name, NULL, addr, size); + if (IS_ERR(tr)) { + pr_warn("Tracing: Failed to create instance buffer %s\n", curr_str); continue; } - /* Allow user space to delete it */ - trace_array_put(tr); + + if (traceoff) + tracer_tracing_off(tr); + + if (traceprintk) + update_printk_trace(tr); + + /* + * If start is set, then this is a mapped buffer, and + * cannot be deleted by user space, so keep the reference + * to it. + */ + if (start) { + tr->flags |= TRACE_ARRAY_FL_BOOT; + tr->ref++; + } while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); @@ -10659,9 +10554,15 @@ __init static int tracer_alloc_buffers(void) global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&global_trace.snapshot_trigger_lock); +#endif ftrace_init_global_array_ops(&global_trace); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&global_trace.mod_events); +#endif + init_trace_flags_index(&global_trace); register_tracer(&nop_trace); @@ -10689,14 +10590,12 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); - test_can_verify(); - return 0; out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: - free_saved_cmdlines_buffer(savedcmd); + trace_free_saved_cmdlines_buffer(); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: @@ -10709,6 +10608,14 @@ out: return ret; } +#ifdef CONFIG_FUNCTION_TRACER +/* Used to set module cached ftrace filtering at boot up */ +__init struct trace_array *trace_get_global_array(void) +{ + return &global_trace; +} +#endif + void __init ftrace_boot_snapshot(void) { #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 00f873910c5d..9c21ba45b7af 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -46,6 +46,7 @@ enum trace_type { TRACE_BRANCH, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, + TRACE_GRAPH_RETADDR_ENT, TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, @@ -334,8 +335,8 @@ struct trace_array { */ struct array_buffer max_buffer; bool allocated_snapshot; -#endif -#ifdef CONFIG_TRACER_MAX_TRACE + spinlock_t snapshot_trigger_lock; + unsigned int snapshot; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -343,6 +344,13 @@ struct trace_array { struct irq_work fsnotify_irqwork; #endif #endif + /* The below is for memory mapped ring buffer */ + unsigned int mapped; + unsigned long range_addr_start; + unsigned long range_addr_size; + long text_delta; + long data_delta; + struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; /* @@ -392,10 +400,16 @@ struct trace_array { cpumask_var_t pipe_cpumask; int ref; int trace_ref; +#ifdef CONFIG_MODULES + struct list_head mod_events; +#endif #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; struct trace_pid_list __rcu *function_no_pids; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + struct fgraph_ops *gops; +#endif #ifdef CONFIG_DYNAMIC_FTRACE /* All of these are protected by the ftrace_lock */ struct list_head func_probes; @@ -419,9 +433,20 @@ struct trace_array { }; enum { - TRACE_ARRAY_FL_GLOBAL = (1 << 0) + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_MOD_INIT = BIT(2), }; +#ifdef CONFIG_MODULES +bool module_exists(const char *module); +#else +static inline bool module_exists(const char *module) +{ + return false; +} +#endif + extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; @@ -501,6 +526,8 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct fgraph_retaddr_ent_entry,\ + TRACE_GRAPH_RETADDR_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct func_repeats_entry, \ @@ -640,6 +667,8 @@ trace_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long len, unsigned int trace_ctx); +int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu); + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -651,9 +680,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) __printf(2, 0); char *trace_iter_expand_format(struct trace_iterator *iter); +bool ignore_event(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -678,9 +706,10 @@ void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); -void trace_graph_return(struct ftrace_graph_ret *trace); -int trace_graph_entry(struct ftrace_graph_ent *trace); -void set_graph_array(struct trace_array *tr); +void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -703,8 +732,6 @@ extern unsigned long tracing_thresh; /* PID filtering */ -extern int pid_max; - bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, @@ -760,6 +787,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; extern unsigned long ftrace_number_of_groups; +extern u64 ftrace_update_time; +extern u64 ftrace_total_mod_time; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } @@ -867,6 +896,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 +#define TRACE_GRAPH_PRINT_RETADDR 0x2000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -888,15 +918,68 @@ extern void graph_trace_close(struct trace_iterator *iter); extern int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx); +extern int __trace_graph_retaddr_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx, + unsigned long retaddr); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned int trace_ctx); + unsigned int trace_ctx, + u64 calltime, u64 rettime); + +extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); +extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); +extern void free_fgraph_ops(struct trace_array *tr); + +enum { + TRACE_GRAPH_FL = 1, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_FL + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, +}; + +#define TRACE_GRAPH_NOTRACE (1 << TRACE_GRAPH_NOTRACE_BIT) + +static inline unsigned long ftrace_graph_depth(unsigned long *task_var) +{ + return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3; +} + +static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth) +{ + *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT); + *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT; +} #ifdef CONFIG_DYNAMIC_FTRACE extern struct ftrace_hash __rcu *ftrace_graph_hash; extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash; -static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) +static inline int +ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace) { unsigned long addr = trace->func; int ret = 0; @@ -918,13 +1001,12 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) } if (ftrace_lookup_ip(hash, addr)) { - /* * This needs to be cleared on the return functions * when the depth is zero. */ - trace_recursion_set(TRACE_GRAPH_BIT); - trace_recursion_set_depth(trace->depth); + *task_var |= TRACE_GRAPH_FL; + ftrace_graph_set_depth(task_var, trace->depth); /* * If no irqs are to be traced, but a set_graph_function @@ -943,11 +1025,14 @@ out: return ret; } -static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +static inline void +ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace) { - if (trace_recursion_test(TRACE_GRAPH_BIT) && - trace->depth == trace_recursion_depth()) - trace_recursion_clear(TRACE_GRAPH_BIT); + unsigned long *task_var = fgraph_get_task_var(gops); + + if ((*task_var & TRACE_GRAPH_FL) && + trace->depth == ftrace_graph_depth(task_var)) + *task_var &= ~TRACE_GRAPH_FL; } static inline int ftrace_graph_notrace_addr(unsigned long addr) @@ -973,7 +1058,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return ret; } #else -static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) +static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace) { return 1; } @@ -982,27 +1067,38 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } -static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace) { } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; +extern bool fgraph_sleep_time; -static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) +static inline bool +ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace) { + unsigned long *task_var = fgraph_get_task_var(gops); + /* trace it when it is-nested-in or is a function enabled. */ - return !(trace_recursion_test(TRACE_GRAPH_BIT) || - ftrace_graph_addr(trace)) || + return !((*task_var & TRACE_GRAPH_FL) || + ftrace_graph_addr(task_var, trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } +void fgraph_init_ops(struct ftrace_ops *dst_ops, + struct ftrace_ops *src_ops); + #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } +static inline void free_fgraph_ops(struct trace_array *tr) { } +/* ftrace_ops may not be defined */ +#define init_array_fgraph_ops(tr, ops) do { } while (0) +#define allocate_fgraph_ops(tr, ops) ({ 0; }) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ extern struct list_head ftrace_pids; @@ -1033,6 +1129,7 @@ void ftrace_destroy_function_files(struct trace_array *tr); int ftrace_allocate_ftrace_ops(struct trace_array *tr); void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); +struct trace_array *trace_get_global_array(void); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); @@ -1250,6 +1347,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ + C(TRACE_PRINTK, "trace_printk_dest"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ @@ -1330,7 +1428,8 @@ struct ftrace_event_field { int filter_type; int offset; int size; - int is_signed; + unsigned int is_signed:1; + unsigned int needs_test:1; int len; }; @@ -1357,10 +1456,6 @@ struct trace_subsystem_dir { int nr_events; }; -extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct trace_buffer *buffer, - struct ring_buffer_event *event); - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, @@ -1375,6 +1470,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr, trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); } +DECLARE_PER_CPU(bool, trace_taskinfo_save); +int trace_save_cmdline(struct task_struct *tsk); +int trace_create_savedcmd(void); +int trace_alloc_tgid_map(void); +void trace_free_saved_cmdlines_buffer(void); + +extern const struct file_operations tracing_saved_cmdlines_fops; +extern const struct file_operations tracing_saved_tgids_fops; +extern const struct file_operations tracing_saved_cmdlines_size_fops; + DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); @@ -1562,6 +1667,29 @@ static inline void *event_file_data(struct file *filp) extern struct mutex event_mutex; extern struct list_head ftrace_events; +/* + * When the trace_event_file is the filp->i_private pointer, + * it must be taken under the event_mutex lock, and then checked + * if the EVENT_FILE_FL_FREED flag is set. If it is, then the + * data pointed to by the trace_event_file can not be trusted. + * + * Use the event_file_file() to access the trace_event_file from + * the filp the first time under the event_mutex and check for + * NULL. If it is needed to be retrieved again and the event_mutex + * is still held, then the event_file_data() can be used and it + * is guaranteed to be valid. + */ +static inline struct trace_event_file *event_file_file(struct file *filp) +{ + struct trace_event_file *file; + + lockdep_assert_held(&event_mutex); + file = READ_ONCE(file_inode(filp)->i_private); + if (!file || file->flags & EVENT_FILE_FL_FREED) + return NULL; + return file; +} + extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; @@ -1973,12 +2101,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len) #ifdef CONFIG_TRACER_SNAPSHOT void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot(struct trace_array *tr); +void tracing_disarm_snapshot(struct trace_array *tr); #else static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } +static inline void tracing_disarm_snapshot(struct trace_array *tr) { } #endif #ifdef CONFIG_PREEMPT_TRACER @@ -2067,4 +2199,11 @@ static inline int rv_init_interface(void) } #endif +/* + * This is used only to distinguish + * function address from trampoline code. + * So this value has no meaning. + */ +#define FTRACE_TRAMPOLINE_MARKER ((unsigned long) INT_MAX) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 54d5fa35c90a..e19c32f2a938 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -92,7 +92,6 @@ static void trace_do_benchmark(void) bm_total += delta; bm_totalsq += delta * delta; - if (bm_cnt > 1) { /* * Apply Welford's method to calculate standard deviation: @@ -105,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - do_div(delta, bm_cnt); + do_div(delta, (u32)bm_cnt); avg = delta; if (stddev > 0) { @@ -127,7 +126,7 @@ static void trace_do_benchmark(void) seed = stddev; if (!last_seed) break; - do_div(seed, last_seed); + seed = div64_u64(seed, last_seed); seed += last_seed; do_div(seed, 2); } while (i++ < 10 && last_seed != seed); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e47fdb4c92fb..6d08a5523ce0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,7 +30,6 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { - struct trace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; struct trace_array_cpu *data; @@ -74,16 +73,13 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) p--; p++; - strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); - strncpy(entry->file, p, TRACE_FILE_SIZE); - entry->func[TRACE_FUNC_SIZE] = 0; - entry->file[TRACE_FILE_SIZE] = 0; + strscpy(entry->func, f->data.func); + strscpy(entry->file, p); entry->constant = f->constant; entry->line = f->data.line; entry->correct = val == expect; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); out: current->trace_recursion &= ~TRACE_BRANCH_BIT; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 4702efb00ff2..4cb2ebc439be 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -154,5 +154,5 @@ static atomic64_t trace_counter; */ u64 notrace trace_clock_counter(void) { - return atomic64_add_return(1, &trace_counter); + return atomic64_inc_return(&trace_counter); } diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..a322e4f249a5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type struct dyn_event *pos, *n; char *system = NULL, *event, *p; int argc, ret = -ENOENT; - char **argv; + char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc); - argv = argv_split(GFP_KERNEL, raw_command, &argc); if (!argv) return -ENOMEM; if (argv[0][0] == '-') { - if (argv[0][1] != ':') { - ret = -EINVAL; - goto out; - } + if (argv[0][1] != ':') + return -EINVAL; event = &argv[0][2]; } else { event = strchr(argv[0], ':'); - if (!event) { - ret = -EINVAL; - goto out; - } + if (!event) + return -EINVAL; event++; } @@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (!system && event[0] == '\0') { - ret = -EINVAL; - goto out; - } + if (!system && event[0] == '\0') + return -EINVAL; mutex_lock(&event_mutex); for_each_dyn_event_safe(pos, n) { @@ -120,8 +113,6 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type } tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); -out: - argv_free(argv); return ret; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c47422b20908..fbfb396905a6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -85,9 +85,35 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) ); -/* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + +/* Function call entry with a return address */ +FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, + + TRACE_GRAPH_RETADDR_ENT, + + F_STRUCT( + __field_struct( struct fgraph_retaddr_ent, graph_ent ) + __field_packed( unsigned long, graph_ent, func ) + __field_packed( int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, retaddr ) + ), + + F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + (void *)__entry->retaddr) +); + +#else + +#ifndef fgraph_retaddr_ent_entry +#define fgraph_retaddr_ent_entry ftrace_graph_ent_entry +#endif + +#endif + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL +/* Function return entry */ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -98,8 +124,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_packed( unsigned long, ret, retval ) __field_packed( int, ret, depth ) __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field(unsigned long long, calltime ) + __field(unsigned long long, rettime ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", @@ -110,6 +136,7 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, #else +/* Function return entry */ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, @@ -119,8 +146,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_packed( unsigned long, ret, func ) __field_packed( int, ret, depth ) __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field(unsigned long long, calltime ) + __field(unsigned long long, rettime ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 03c851f57969..82fd637cfc19 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -220,7 +220,7 @@ static struct trace_eprobe *alloc_event_probe(const char *group, if (!ep->event_system) goto error; - ret = trace_probe_init(&ep->tp, this_event, group, false); + ret = trace_probe_init(&ep->tp, this_event, group, false, nargs); if (ret < 0) goto error; @@ -390,8 +390,8 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { unsigned long val; int ret; @@ -438,7 +438,7 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); - store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + store_trace_args(&entry[1], &edata->ep->tp, rec, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } @@ -912,10 +912,15 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } } - mutex_lock(&event_mutex); - event_call = find_and_get_event(sys_name, sys_event); - ep = alloc_event_probe(group, event, event_call, argc - 2); - mutex_unlock(&event_mutex); + if (argc - 2 > MAX_TRACE_ARGS) { + ret = -E2BIG; + goto error; + } + + scoped_guard(mutex, &event_mutex) { + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + } if (IS_ERR(ep)) { ret = PTR_ERR(ep); @@ -937,7 +942,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) argc -= 2; argv += 2; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); ret = trace_eprobe_tp_update_arg(ep, argv, i); if (ret) @@ -947,18 +952,21 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (ret < 0) goto error; init_trace_eprobe_call(ep); - mutex_lock(&event_mutex); - ret = trace_probe_register_event_call(&ep->tp); - if (ret) { - if (ret == -EEXIST) { - trace_probe_log_set_index(0); - trace_probe_log_err(0, EVENT_EXIST); + scoped_guard(mutex, &event_mutex) { + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + goto error; } - mutex_unlock(&event_mutex); - goto error; } - ret = dyn_event_add(&ep->devent, &ep->tp.event->call); - mutex_unlock(&event_mutex); return ret; parse_error: ret = -EINVAL; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 05e791241812..3ff9caa4a71b 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; + struct hw_perf_event *hwc = &p_event->hw; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; + if (is_sampling_event(p_event)) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(p_event); + } + /* * If TRACE_REG_PERF_ADD returns false; no custom action was performed * and we need to take the default action of enqueueing our event on diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7c364b87352e..513de9ceb80e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system) } static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +__find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; @@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type, int len) + int is_signed, int filter_type, int len, + int need_test) { struct ftrace_event_field *field; @@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->needs_test = need_test; field->len = len; list_add(&field->link, head); @@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, 0); + is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, - int filter_type, int len) + int filter_type, int len, int need_test) { struct list_head *head; @@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, len); + is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type, 0); \ + filter_type, 0, 0); \ if (ret) \ return ret; @@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER, 0); \ + is_signed_type(type), FILTER_OTHER, \ + 0, 0); \ if (ret) \ return ret; @@ -244,19 +247,16 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } -/* - * Check if the referenced field is an array and return true, - * as arrays are OK to dereference. - */ -static bool test_field(const char *fmt, struct trace_event_call *call) + +static struct trace_event_fields *find_event_field(const char *fmt, + struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; - const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) - return false; + return NULL; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') @@ -265,16 +265,141 @@ static bool test_field(const char *fmt, struct trace_event_call *call) len = p - fmt; for (; field->type; field++) { - if (strncmp(field->name, fmt, len) || - field->name[len]) + if (strncmp(field->name, fmt, len) || field->name[len]) continue; - array_descriptor = strchr(field->type, '['); - /* This is an array and is OK to dereference. */ - return array_descriptor != NULL; + + return field; + } + return NULL; +} + +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field; + + field = find_event_field(fmt, call); + if (!field) + return false; + + /* This is an array and is OK to dereference. */ + return strchr(field->type, '[') != NULL; +} + +/* Look for a string within an argument */ +static bool find_print_string(const char *arg, const char *str, const char *end) +{ + const char *r; + + r = strstr(arg, str); + return r && r < end; +} + +/* Return true if the argument pointer is safe */ +static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *a; + + e = fmt + len; + + /* Find the REC-> in the argument */ + r = strstr(fmt, "REC->"); + if (r && r < e) { + /* + * Addresses of events on the buffer, or an array on the buffer is + * OK to dereference. There's ways to fool this, but + * this is to catch common mistakes, not malicious code. + */ + a = strchr(fmt, '&'); + if ((a && (a < r)) || test_field(r, call)) + return true; + } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_sockaddr(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { + return true; } return false; } +/* Return true if the string is safe */ +static bool process_string(const char *fmt, int len, struct trace_event_call *call) +{ + struct trace_event_fields *field; + const char *r, *e, *s; + + e = fmt + len; + + /* + * There are several helper functions that return strings. + * If the argument contains a function, then assume its field is valid. + * It is considered that the argument has a function if it has: + * alphanumeric or '_' before a parenthesis. + */ + s = fmt; + do { + r = strstr(s, "("); + if (!r || r >= e) + break; + for (int i = 1; r - i >= s; i++) { + char ch = *(r - i); + if (isspace(ch)) + continue; + if (isalnum(ch) || ch == '_') + return true; + /* Anything else, this isn't a function */ + break; + } + /* A function could be wrapped in parethesis, try the next one */ + s = r + 1; + } while (s < e); + + /* + * Check for arrays. If the argument has: foo[REC->val] + * then it is very likely that foo is an array of strings + * that are safe to use. + */ + r = strstr(s, "["); + if (r && r < e) { + r = strstr(r, "REC->"); + if (r && r < e) + return true; + } + + /* + * If there's any strings in the argument consider this arg OK as it + * could be: REC->field ? "foo" : "bar" and we don't want to get into + * verifying that logic here. + */ + if (find_print_string(fmt, "\"", e)) + return true; + + /* Dereferenced strings are also valid like any other pointer */ + if (process_pointer(fmt, len, call)) + return true; + + /* Make sure the field is found */ + field = find_event_field(fmt, call); + if (!field) + return false; + + /* Test this field's string before printing the event */ + call->flags |= TRACE_EVENT_FL_TEST_STR; + field->needs_test = 1; + + return true; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -284,13 +409,14 @@ static bool test_field(const char *fmt, struct trace_event_call *call) static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; + u64 string_flags = 0; bool first = true; - const char *fmt, *c, *r, *a; + const char *fmt; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; - int i; + int i, e; fmt = call->print_fmt; @@ -374,8 +500,16 @@ static void test_event_printk(struct trace_event_call *call) star = true; continue; } - if ((fmt[i + j] == 's') && star) - arg++; + if ((fmt[i + j] == 's')) { + if (star) + arg++; + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + string_flags |= 1ULL << arg; + } break; } break; @@ -403,42 +537,47 @@ static void test_event_printk(struct trace_event_call *call) case ',': if (in_quote || parens) continue; + e = i; i++; while (isspace(fmt[i])) i++; - start_arg = i; - if (!(dereference_flags & (1ULL << arg))) - goto next_arg; - /* Find the REC-> in the argument */ - c = strchr(fmt + i, ','); - r = strstr(fmt + i, "REC->"); - if (r && (!c || r < c)) { - /* - * Addresses of events on the buffer, - * or an array on the buffer is - * OK to dereference. - * There's ways to fool this, but - * this is to catch common mistakes, - * not malicious code. - */ - a = strchr(fmt + i, '&'); - if ((a && (a < r)) || test_field(r, call)) + /* + * If start_arg is zero, then this is the start of the + * first argument. The processing of the argument happens + * when the end of the argument is found, as it needs to + * handle paranthesis and such. + */ + if (!start_arg) { + start_arg = i; + /* Balance out the i++ in the for loop */ + i--; + continue; + } + + if (dereference_flags & (1ULL << arg)) { + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); } - next_arg: - i--; + start_arg = i; arg++; + /* Balance out the i++ in the for loop */ + i--; } } + if (dereference_flags & (1ULL << arg)) { + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored @@ -730,6 +869,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file, return __ftrace_event_enable_disable(file, enable, 0); } +#ifdef CONFIG_MODULES +struct event_mod_load { + struct list_head list; + char *module; + char *match; + char *system; + char *event; +}; + +static void free_event_mod(struct event_mod_load *event_mod) +{ + list_del(&event_mod->list); + kfree(event_mod->module); + kfree(event_mod->match); + kfree(event_mod->system); + kfree(event_mod->event); + kfree(event_mod); +} + +static void clear_mod_events(struct trace_array *tr) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + free_event_mod(event_mod); + } +} + +static int remove_cache_mod(struct trace_array *tr, const char *mod, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod, *n; + int ret = -EINVAL; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod) != 0) + continue; + + if (match && strcmp(event_mod->match, match) != 0) + continue; + + if (system && + (!event_mod->system || strcmp(event_mod->system, system) != 0)) + continue; + + if (event && + (!event_mod->event || strcmp(event_mod->event, event) != 0)) + continue; + + free_event_mod(event_mod); + ret = 0; + } + + return ret; +} + +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod; + + /* If the module exists, then this just failed to find an event */ + if (module_exists(mod)) + return -EINVAL; + + /* See if this is to remove a cached filter */ + if (!set) + return remove_cache_mod(tr, mod, match, system, event); + + event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL); + if (!event_mod) + return -ENOMEM; + + INIT_LIST_HEAD(&event_mod->list); + event_mod->module = kstrdup(mod, GFP_KERNEL); + if (!event_mod->module) + goto out_free; + + if (match) { + event_mod->match = kstrdup(match, GFP_KERNEL); + if (!event_mod->match) + goto out_free; + } + + if (system) { + event_mod->system = kstrdup(system, GFP_KERNEL); + if (!event_mod->system) + goto out_free; + } + + if (event) { + event_mod->event = kstrdup(event, GFP_KERNEL); + if (!event_mod->event) + goto out_free; + } + + list_add(&event_mod->list, &tr->mod_events); + + return 0; + + out_free: + free_event_mod(event_mod); + + return -ENOMEM; +} +#else /* CONFIG_MODULES */ +static inline void clear_mod_events(struct trace_array *tr) { } +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + return -EINVAL; +} +#endif + static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; @@ -738,6 +991,7 @@ static void ftrace_clear_events(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } + clear_mod_events(tr); mutex_unlock(&event_mutex); } @@ -992,18 +1246,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) void event_file_get(struct trace_event_file *file) { - atomic_inc(&file->ref); + refcount_inc(&file->ref); } void event_file_put(struct trace_event_file *file) { - if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (WARN_ON_ONCE(!refcount_read(&file->ref))) { if (file->flags & EVENT_FILE_FL_FREED) kmem_cache_free(file_cachep, file); return; } - if (atomic_dec_and_test(&file->ref)) { + if (refcount_dec_and_test(&file->ref)) { /* Count should only go to zero when it is freed */ if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) return; @@ -1026,17 +1280,36 @@ static void remove_event_file_dir(struct trace_event_file *file) */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { struct trace_event_file *file; struct trace_event_call *call; + char *module __free(kfree) = NULL; const char *name; int ret = -EINVAL; int eret = 0; + if (mod) { + char *p; + + module = kstrdup(mod, GFP_KERNEL); + if (!module) + return -ENOMEM; + + /* Replace all '-' with '_' as that's what modules do */ + for (p = strchr(module, '-'); p; p = strchr(p + 1, '-')) + *p = '_'; + } + list_for_each_entry(file, &tr->events, list) { call = file->event_call; + + /* If a module is specified, skip events that are not that module */ + if (module && (!call->module || strcmp(module_name(call->module), module))) + continue; + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) @@ -1069,16 +1342,24 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, ret = eret; } + /* + * If this is a module setting and nothing was found, + * check if the module was loaded. If it wasn't cache it. + */ + if (module && ret == -EINVAL && !eret) + ret = cache_mod(tr, module, set, match, sub, event); + return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { int ret; mutex_lock(&event_mutex); - ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); return ret; @@ -1086,11 +1367,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { - char *event = NULL, *sub = NULL, *match; + char *event = NULL, *sub = NULL, *match, *mod; int ret; if (!tr) return -ENOENT; + + /* Modules events can be appened with :mod:<module> */ + mod = strstr(buf, ":mod:"); + if (mod) { + *mod = '\0'; + /* move to the module name */ + mod += 5; + } + /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. @@ -1113,9 +1403,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; + } else if (mod) { + /* Allow wildcard for no length or star */ + if (!strlen(match) || strcmp(match, "*") == 0) + match = NULL; } - ret = __ftrace_set_clr_event(tr, match, sub, event, set); + ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod); /* Put back the colon to allow this to be called again */ if (buf) @@ -1143,7 +1437,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) if (!tr) return -ENODEV; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -1169,7 +1463,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system, return -ENOENT; set = (enable == true) ? 1 : 0; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); @@ -1256,37 +1550,78 @@ static void *t_start(struct seq_file *m, loff_t *pos) return file; } +enum set_event_iter_type { + SET_EVENT_FILE, + SET_EVENT_MOD, +}; + +struct set_event_iter { + enum set_event_iter_type type; + union { + struct trace_event_file *file; + struct event_mod_load *event_mod; + }; +}; + static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_file *file = v; + struct set_event_iter *iter = v; + struct trace_event_file *file; struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(file, &tr->events, list) { - if (file->flags & EVENT_FILE_FL_ENABLED) - return file; + if (iter->type == SET_EVENT_FILE) { + file = iter->file; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & EVENT_FILE_FL_ENABLED) { + iter->file = file; + return iter; + } + } +#ifdef CONFIG_MODULES + iter->type = SET_EVENT_MOD; + iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list); +#endif } +#ifdef CONFIG_MODULES + list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list) + return iter; +#endif + + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. + */ + kfree(iter); return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct trace_event_file *file; struct trace_array *tr = m->private; + struct set_event_iter *iter; loff_t l; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + mutex_lock(&event_mutex); - file = list_entry(&tr->events, struct trace_event_file, list); + iter->type = SET_EVENT_FILE; + iter->file = list_entry(&tr->events, struct trace_event_file, list); + for (l = 0; l <= *pos; ) { - file = s_next(m, file, &l); - if (!file) + iter = s_next(m, iter, &l); + if (!iter) break; } - return file; + return iter; } static int t_show(struct seq_file *m, void *v) @@ -1306,6 +1641,45 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +#ifdef CONFIG_MODULES +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + const char *system; + const char *event; + + if (iter->type == SET_EVENT_FILE) + return t_show(m, iter->file); + + /* When match is set, system and event are not */ + if (iter->event_mod->match) { + seq_printf(m, "%s:mod:%s\n", iter->event_mod->match, + iter->event_mod->module); + return 0; + } + + system = iter->event_mod->system ? : "*"; + event = iter->event_mod->event ? : "*"; + + seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module); + + return 0; +} +#else /* CONFIG_MODULES */ +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + + return t_show(m, iter->file); +} +#endif + +static void s_stop(struct seq_file *m, void *v) +{ + kfree(v); + t_stop(m, NULL); +} + static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { @@ -1386,12 +1760,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[4] = "0"; mutex_lock(&event_mutex); - file = event_file_data(filp); + file = event_file_file(filp); if (likely(file)) flags = file->flags; mutex_unlock(&event_mutex); - if (!file || flags & EVENT_FILE_FL_FREED) + if (!file) return -ENODEV; if (flags & EVENT_FILE_FL_ENABLED && @@ -1419,21 +1793,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; + guard(mutex)(&event_mutex); + switch (val) { case 0: case 1: - ret = -ENODEV; - mutex_lock(&event_mutex); - file = event_file_data(filp); - if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) { - ret = tracing_update_buffers(file->tr); - if (ret < 0) { - mutex_unlock(&event_mutex); - return ret; - } - ret = ftrace_event_enable_disable(file, val); - } - mutex_unlock(&event_mutex); + file = event_file_file(filp); + if (!file) + return -ENODEV; + ret = tracing_update_buffers(file->tr); + if (ret < 0) + return ret; + ret = ftrace_event_enable_disable(file, val); + if (ret < 0) + return ret; break; default: @@ -1442,7 +1815,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return ret ? ret : cnt; + return cnt; } static ssize_t @@ -1520,7 +1893,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL); if (ret) goto out; @@ -1540,7 +1913,8 @@ enum { static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_call *call = event_file_data(m->private); + struct trace_event_file *file = event_file_data(m->private); + struct trace_event_call *call = file->event_call; struct list_head *common_head = &ftrace_common_fields; struct list_head *head = trace_get_fields(call); struct list_head *node = v; @@ -1572,7 +1946,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) static int f_show(struct seq_file *m, void *v) { - struct trace_event_call *call = event_file_data(m->private); + struct trace_event_file *file = event_file_data(m->private); + struct trace_event_call *call = file->event_call; struct ftrace_event_field *field; const char *array_descriptor; @@ -1627,12 +2002,14 @@ static int f_show(struct seq_file *m, void *v) static void *f_start(struct seq_file *m, loff_t *pos) { + struct trace_event_file *file; void *p = (void *)FORMAT_HEADER; loff_t l = 0; /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); - if (!event_file_data(m->private)) + file = event_file_file(m->private); + if (!file) return ERR_PTR(-ENODEV); while (l < *pos && p) @@ -1670,6 +2047,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1684,6 +2062,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } +#endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, @@ -1704,8 +2083,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file && !(file->flags & EVENT_FILE_FL_FREED)) + file = event_file_file(filp); + if (file) print_event_filter(file, s); mutex_unlock(&event_mutex); @@ -1734,9 +2113,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return PTR_ERR(buf); mutex_lock(&event_mutex); - file = event_file_data(filp); - if (file) - err = apply_event_filter(file, buf); + file = event_file_file(filp); + if (file) { + if (file->flags & EVENT_FILE_FL_FREED) + err = -ENODEV; + else + err = apply_event_filter(file, buf); + } mutex_unlock(&event_mutex); kfree(buf); @@ -2008,7 +2391,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, @@ -2024,7 +2407,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); @@ -2049,11 +2432,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); - out: - mutex_unlock(&event_mutex); - - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -2088,8 +2467,8 @@ static const struct seq_operations show_event_seq_ops = { static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, - .show = t_show, - .stop = t_stop, + .show = s_show, + .stop = s_stop, }; static const struct seq_operations show_set_pid_seq_ops = { @@ -2152,10 +2531,12 @@ static const struct file_operations ftrace_event_format_fops = { .release = seq_release, }; +#ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; +#endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, @@ -2459,7 +2840,7 @@ event_define_fields(struct trace_event_call *call) ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, - field->len); + field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; @@ -2481,7 +2862,6 @@ static int event_callback(const char *name, umode_t *mode, void **data, if (strcmp(name, "format") == 0) { *mode = TRACE_MODE_READ; *fops = &ftrace_event_format_fops; - *data = call; return 1; } @@ -2548,6 +2928,14 @@ static int event_callback(const char *name, umode_t *mode, void **data, return 0; } +/* The file is incremented on creation and freeing the enable file decrements it */ +static void event_release(const char *name, void *data) +{ + struct trace_event_file *file = data; + + event_file_put(file); +} + static int event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { @@ -2562,6 +2950,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) { .name = "enable", .callback = event_callback, + .release = event_release, }, { .name = "filter", @@ -2630,6 +3019,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) return ret; } + /* Gets decremented on freeing of the "enable" file */ + event_file_get(file); + return 0; } @@ -2949,6 +3341,20 @@ static bool event_in_systems(struct trace_event_call *call, return !*p || isspace(*p) || *p == ','; } +#ifdef CONFIG_HIST_TRIGGERS +/* + * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger + * may happen in any context. + */ +static void hist_poll_event_irq_work(struct irq_work *work) +{ + wake_up_all(&hist_poll_wq); +} + +DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); +DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); +#endif + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2980,7 +3386,7 @@ trace_create_new_event(struct trace_event_call *call, atomic_set(&file->tm_ref, 0); INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); - event_file_get(file); + refcount_set(&file->ref, 1); return file; } @@ -3107,13 +3513,13 @@ int trace_add_event_call(struct trace_event_call *call) int ret; lockdep_assert_held(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __register_event(call, NULL); - if (ret >= 0) - __add_event_to_tracers(call); + if (ret < 0) + return ret; - mutex_unlock(&trace_types_lock); + __add_event_to_tracers(call); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); @@ -3126,8 +3532,6 @@ static void __trace_remove_event_call(struct trace_event_call *call) { event_remove(call); trace_destroy_fields(call); - free_event_filter(call->filter); - call->filter = NULL; } static int probe_remove_event_call(struct trace_event_call *call) @@ -3195,6 +3599,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call); event++) #ifdef CONFIG_MODULES +static void update_mod_cache(struct trace_array *tr, struct module *mod) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod->name) != 0) + continue; + + __ftrace_set_clr_event_nolock(tr, event_mod->match, + event_mod->system, + event_mod->event, 1, mod->name); + free_event_mod(event_mod); + } +} + +static void update_cache_events(struct module *mod) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + update_mod_cache(tr, mod); +} static void trace_module_add_events(struct module *mod) { @@ -3217,6 +3643,8 @@ static void trace_module_add_events(struct module *mod) __register_event(*call, mod); __add_event_to_tracers(*call); } + + update_cache_events(mod); } static void trace_module_remove_events(struct module *mod) @@ -3369,30 +3797,21 @@ struct trace_event_file *trace_get_event_file(const char *instance, return ERR_PTR(ret); } - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); - ret = -EINVAL; - goto out; + return ERR_PTR(-EINVAL); } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); - ret = -EBUSY; - goto out; + return ERR_PTR(-EBUSY); } - ret = 0; - out: - mutex_unlock(&event_mutex); - - if (ret) - file = ERR_PTR(ret); - return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); @@ -3610,6 +4029,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; + unsigned long count = -1; const char *system; const char *event; char *number; @@ -3629,12 +4049,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, event = strsep(¶m, ":"); - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) - goto out; + return -EINVAL; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; @@ -3643,74 +4062,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, else ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - goto out; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); - ret = -ENOMEM; + if (param) { + number = strsep(¶m, ":"); - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - data->enable = enable; - data->count = -1; - data->file = file; - - if (!param) - goto out_reg; - - number = strsep(¶m, ":"); - - ret = -EINVAL; - if (!strlen(number)) - goto out_free; + if (!strlen(number)) + return -EINVAL; - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &data->count); - if (ret) - goto out_free; + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &count); + if (ret) + return ret; + } - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); - if (!ret) { - ret = -EBUSY; - goto out_free; - } + if (!ret) + return -EBUSY; ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_put; + + data->enable = enable; + data->count = count; + data->file = file; + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) - goto out_disable; + /* Just return zero, not the number of enabled functions */ - ret = 0; - out: - mutex_unlock(&event_mutex); - return ret; + if (ret > 0) + return 0; + + kfree(data); + + if (!ret) + ret = -ENOENT; - out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); - out_free: - kfree(data); - goto out; + return ret; } static struct ftrace_func_command event_enable_cmd = { @@ -3933,20 +4340,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) - goto out_unlock; + return ret; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } /* Must be called with event_mutex held */ @@ -3961,7 +4365,7 @@ int event_trace_del_tracer(struct trace_array *tr) __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ - __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); @@ -4245,7 +4649,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); @@ -4254,7 +4658,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", system->name); @@ -4269,7 +4673,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; @@ -4278,7 +4682,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0c611b281a5b..0993dfc1c5c1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1616,7 +1616,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; ret = kstrtoul(num_buf, 0, &ip); @@ -1694,7 +1694,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; } else if (!strncmp(str + i, "CPUS", 4)) { @@ -1859,7 +1859,7 @@ static int parse_pred(const char *str, void *data, if (!pred->regex) goto err_mem; pred->regex->len = len; - strncpy(pred->regex->pattern, str + s, len); + memcpy(pred->regex->pattern, str + s, len); pred->regex->pattern[len] = 0; filter_build_regex(pred); @@ -1919,7 +1919,7 @@ static int parse_pred(const char *str, void *data, goto err_free; } - strncpy(num_buf, str + s, len); + memcpy(num_buf, str + s, len); num_buf[len] = 0; /* Make sure it is a value */ @@ -2405,13 +2405,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, struct event_filter *filter = NULL; int err = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Make sure the system still has events */ - if (!dir->nr_events) { - err = -ENODEV; - goto out_unlock; - } + if (!dir->nr_events) + return -ENODEV; if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(dir, tr); @@ -2422,7 +2420,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); - goto out_unlock; + return 0; } err = create_system_filter(dir, filter_string, &filter); @@ -2434,8 +2432,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, __free_filter(system->filter); system->filter = filter; } -out_unlock: - mutex_unlock(&event_mutex); return err; } @@ -2612,17 +2608,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, struct event_filter *filter = NULL; struct trace_event_call *call; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); call = event->tp_event; - err = -EINVAL; if (!call) - goto out_unlock; + return -EINVAL; - err = -EEXIST; if (event->filter) - goto out_unlock; + return -EEXIST; err = create_filter(NULL, call, filter_str, false, &filter); if (err) @@ -2637,9 +2631,6 @@ free_filter: if (err || ftrace_event_is_function(call)) __free_filter(filter); -out_unlock: - mutex_unlock(&event_mutex); - return err; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6ece1308d36a..53dc6719181e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -822,7 +822,7 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, { struct tracepoint *tp = event->tp; - if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + if (unlikely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; synth_probe_func_t probe_func; void *__data; @@ -1354,10 +1354,7 @@ static const char *hist_field_name(struct hist_field *field, } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) field_name = "common_timestamp"; else if (field->flags & HIST_FIELD_FL_STACKTRACE) { - if (field->field) - field_name = field->field->name; - else - field_name = "common_stacktrace"; + field_name = "common_stacktrace"; } else if (field->flags & HIST_FIELD_FL_HITCOUNT) field_name = "hitcount"; @@ -1599,7 +1596,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - strncpy(comm, task->comm, TASK_COMM_LEN); + strscpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -3405,7 +3402,7 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) elt_data = context->elt->private_data; track_elt_data = track_data->elt.private_data; if (elt_data->comm) - strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + strscpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); track_data->updated = true; @@ -5314,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data, if (resolve_var_refs(hist_data, key, var_ref_vals, true)) hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + + hist_poll_wakeup(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5593,49 +5592,137 @@ static void hist_trigger_show(struct seq_file *m, n_entries, (u64)atomic64_read(&hist_data->map->drops)); } +struct hist_file_data { + struct file *file; + u64 last_read; + u64 last_act; +}; + +static u64 get_hist_hit_count(struct trace_event_file *event_file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *data; + u64 ret = 0; + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = data->private_data; + ret += atomic64_read(&hist_data->map->hits); + } + } + return ret; +} + static int hist_show(struct seq_file *m, void *v) { + struct hist_file_data *hist_file = m->private; struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - event_file = event_file_data(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + event_file = event_file_file(hist_file->file); + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } + hist_file->last_read = get_hist_hit_count(event_file); + /* + * Update last_act too so that poll()/POLLPRI can wait for the next + * event after any syscall on hist file. + */ + hist_file->last_act = hist_file->last_read; - out_unlock: - mutex_unlock(&event_mutex); + return 0; +} + +static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait) +{ + struct trace_event_file *event_file; + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + __poll_t ret = 0; + u64 cnt; + + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return EPOLLERR; + + hist_poll_wait(file, wait); + + cnt = get_hist_hit_count(event_file); + if (hist_file->last_read != cnt) + ret |= EPOLLIN | EPOLLRDNORM; + if (hist_file->last_act != cnt) { + hist_file->last_act = cnt; + ret |= EPOLLPRI; + } return ret; } +static int event_hist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + kfree(hist_file); + return tracing_single_release_file_tr(inode, file); +} + static int event_hist_open(struct inode *inode, struct file *file) { + struct trace_event_file *event_file; + struct hist_file_data *hist_file; int ret; ret = tracing_open_file_tr(inode, file); if (ret) return ret; + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) { + ret = -ENODEV; + goto err; + } + + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); + if (!hist_file) { + ret = -ENOMEM; + goto err; + } + + hist_file->file = file; + hist_file->last_act = get_hist_hit_count(event_file); + /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_show, file); + ret = single_open(file, hist_show, hist_file); + if (ret) { + kfree(hist_file); + goto err; + } + + return 0; +err: + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = tracing_single_release_file_tr, + .release = event_hist_release, + .poll = event_hist_poll, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG @@ -5876,25 +5963,19 @@ static int hist_debug_show(struct seq_file *m, void *v) { struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - event_file = event_file_data(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + event_file = event_file_file(m->private); + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_debug_show(m, data, n++); } - - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } static int event_hist_debug_open(struct inode *inode, struct file *file) @@ -5907,7 +5988,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_debug_show, file); + ret = single_open(file, hist_debug_show, file); + if (ret) + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_debug_fops = { @@ -6652,27 +6736,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index 8650562bdaa9..a8f076809db4 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, strim(buf); mutex_lock(&event_mutex); - file = event_file_data(filp); + file = event_file_file(filp); if (file) { call = file->event_call; size = parse_entry(buf, call, &entry); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c82b401a294d..e3f7d09e5512 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -49,16 +49,11 @@ static char *last_cmd; static int errpos(const char *str) { - int ret = 0; - - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!str || !last_cmd) - goto out; + return 0; - ret = err_pos(last_cmd, str); - out: - mutex_unlock(&lastcmd_mutex); - return ret; + return err_pos(last_cmd, str); } static void last_cmd_set(const char *str) @@ -74,14 +69,12 @@ static void last_cmd_set(const char *str) static void synth_err(u8 err_type, u16 err_pos) { - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!last_cmd) - goto out; + return; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); - out: - mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b33c3861fbbb..d45448947094 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos) /* ->stop() is called even if ->start() fails */ mutex_lock(&event_mutex); - event_file = event_file_data(m->private); + event_file = event_file_file(m->private); if (unlikely(!event_file)) return ERR_PTR(-ENODEV); @@ -211,12 +211,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - if (unlikely(!event_file_data(file))) { - mutex_unlock(&event_mutex); + if (unlikely(!event_file_file(file))) return -ENODEV; - } if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -239,8 +237,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) } } - mutex_unlock(&event_mutex); - return ret; } @@ -248,7 +244,6 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next; struct event_command *p; - int ret = -EINVAL; next = buff = skip_spaces(buff); command = strsep(&next, ": \t"); @@ -259,17 +254,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) } command = (command[0] != '!') ? command : command + 1; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->parse(p, file, buff, command, next); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->parse(p, file, buff, command, next); } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t event_trigger_regex_write(struct file *file, @@ -278,7 +270,7 @@ static ssize_t event_trigger_regex_write(struct file *file, { struct trace_event_file *event_file; ssize_t ret; - char *buf; + char *buf __free(kfree) = NULL; if (!cnt) return 0; @@ -292,24 +284,18 @@ static ssize_t event_trigger_regex_write(struct file *file, strim(buf); - mutex_lock(&event_mutex); - event_file = event_file_data(file); - if (unlikely(!event_file)) { - mutex_unlock(&event_mutex); - kfree(buf); + guard(mutex)(&event_mutex); + + event_file = event_file_file(file); + if (unlikely(!event_file)) return -ENODEV; - } - ret = trigger_process_regex(event_file, buf); - mutex_unlock(&event_mutex); - kfree(buf); + ret = trigger_process_regex(event_file, buf); if (ret < 0) - goto out; + return ret; *ppos += cnt; - ret = cnt; - out: - return ret; + return cnt; } static int event_trigger_regex_release(struct inode *inode, struct file *file) @@ -359,20 +345,16 @@ const struct file_operations event_trigger_fops = { __init int register_event_command(struct event_command *cmd) { struct event_command *p; - int ret = 0; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &trigger_commands); - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return 0; } /* @@ -382,20 +364,17 @@ __init int register_event_command(struct event_command *cmd) __init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; - int ret = -ENODEV; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -ENODEV; } /** @@ -597,20 +576,12 @@ out: return ret; } -/** - * unregister_trigger - Generic event_command @unreg implementation - * @glob: The raw string used to register the trigger - * @test: Trigger-specific data used to find the trigger to remove - * @file: The trace_event_file associated with the event - * - * Common implementation for event trigger unregistration. - * - * Usually used directly as the @unreg method in event command - * implementations. +/* + * True if the trigger was found and unregistered, else false. */ -static void unregister_trigger(char *glob, - struct event_trigger_data *test, - struct trace_event_file *file) +static bool try_unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data = NULL, *iter; @@ -626,8 +597,32 @@ static void unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); + if (data) { + if (data->ops->free) + data->ops->free(data); + + return true; + } + + return false; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The trace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) +{ + try_unregister_trigger(glob, test, file); } /* @@ -1470,12 +1465,23 @@ register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = tracing_alloc_snapshot_instance(file->tr); + int ret = tracing_arm_snapshot(file->tr); if (ret < 0) return ret; - return register_trigger(glob, data, file); + ret = register_trigger(glob, data, file); + if (ret < 0) + tracing_disarm_snapshot(file->tr); + return ret; +} + +static void unregister_snapshot_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + if (try_unregister_trigger(glob, data, file)) + tracing_disarm_snapshot(file->tr); } static int @@ -1510,7 +1516,7 @@ static struct event_command trigger_snapshot_cmd = { .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, - .unreg = unregister_trigger, + .unreg = unregister_snapshot_trigger, .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, }; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e76f5e1efdf2..97325fbd6283 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. */ struct user_event_group { - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + char *system_multi_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that moves forward within the group for multi-event names */ + u64 multi_id; }; /* Group for init_user_ns mapping, top-most group */ @@ -78,6 +82,7 @@ static unsigned int current_user_events; */ struct user_event { struct user_event_group *group; + char *reg_name; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -127,6 +132,8 @@ struct user_event_enabler { #define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) +#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT) + /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { struct work_struct work; @@ -202,6 +209,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -328,6 +337,7 @@ out: static void user_event_group_destroy(struct user_event_group *group) { kfree(group->system_name); + kfree(group->system_multi_name); kfree(group); } @@ -346,6 +356,11 @@ static char *user_event_group_system_name(void) return system_name; } +static char *user_event_group_system_multi_name(void) +{ + return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL); +} + static struct user_event_group *current_user_event_group(void) { return init_group; @@ -365,6 +380,11 @@ static struct user_event_group *user_event_group_create(void) if (!group->system_name) goto error; + group->system_multi_name = user_event_group_system_multi_name(); + + if (!group->system_multi_name) + goto error; + mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -1480,6 +1500,11 @@ static int destroy_user_event(struct user_event *user) hash_del(&user->node); user_event_destroy_validators(user); + + /* If we have different names, both must be freed */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user->call.print_fmt); kfree(EVENT_NAME(user)); kfree(user); @@ -1493,17 +1518,36 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + /* + * Single-format events shouldn't return multi-format + * events. Callers expect the underlying tracepoint to match + * the name exactly in these cases. Only check like-formats. + */ + if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags)) + continue; + + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + /* Scan others if this is a multi-format event */ + if (EVENT_MULTI_FORMAT(flags)) + continue; + + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1632,7 +1676,7 @@ static void update_enable_bit_for(struct user_event *user) struct tracepoint *tp = &user->tracepoint; char status = 0; - if (atomic_read(&tp->key.enabled) > 0) { + if (static_key_enabled(&tp->key)) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; @@ -1860,6 +1904,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = &user->fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; @@ -1877,13 +1924,15 @@ static bool user_event_match(const char *system, const char *event, struct user_event *user = container_of(ev, struct user_event, devent); bool match; - match = strcmp(EVENT_NAME(user), event) == 0 && - (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + match = strcmp(EVENT_NAME(user), event) == 0; - if (match && argc > 0) + if (match && system) { + match = strcmp(system, user->group->system_name) == 0 || + strcmp(system, user->group->system_multi_name) == 0; + } + + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(&user->fields); return match; } @@ -1913,6 +1962,107 @@ static int user_event_trace_register(struct user_event *user) return ret; } +static int user_event_set_tp_name(struct user_event *user) +{ + lockdep_assert_held(&user->group->reg_mutex); + + if (EVENT_MULTI_FORMAT(user->reg_flags)) { + char *multi_name; + + multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx", + user->reg_name, user->group->multi_id); + + if (!multi_name) + return -ENOMEM; + + user->call.name = multi_name; + user->tracepoint.name = multi_name; + + /* Inc to ensure unique multi-event name next time */ + user->group->multi_id++; + } else { + /* Non Multi-format uses register name */ + user->call.name = user->reg_name; + user->tracepoint.name = user->reg_name; + } + + return 0; +} + +/* + * Counts how many ';' without a trailing space are in the args. + */ +static int count_semis_no_space(char *args) +{ + int count = 0; + + while ((args = strchr(args, ';'))) { + args++; + + if (!isspace(*args)) + count++; + } + + return count; +} + +/* + * Copies the arguments while ensuring all ';' have a trailing space. + */ +static char *insert_space_after_semis(char *args, int count) +{ + char *fixed, *pos; + int len; + + len = strlen(args) + count; + fixed = kmalloc(len + 1, GFP_KERNEL); + + if (!fixed) + return NULL; + + pos = fixed; + + /* Insert a space after ';' if there is no trailing space. */ + while (*args) { + *pos = *args++; + + if (*pos++ == ';' && !isspace(*args)) + *pos++ = ' '; + } + + *pos = '\0'; + + return fixed; +} + +static char **user_event_argv_split(char *args, int *argc) +{ + char **split; + char *fixed; + int count; + + /* Count how many ';' without a trailing space */ + count = count_semis_no_space(args); + + /* No fixup is required */ + if (!count) + return argv_split(GFP_KERNEL, args, argc); + + /* We must fixup 'field;field' to 'field; field' */ + fixed = insert_space_after_semis(args, count); + + if (!fixed) + return NULL; + + /* We do a normal split afterwards */ + split = argv_split(GFP_KERNEL, fixed, argc); + + /* We can free since argv_split makes a copy */ + kfree(fixed); + + return split; +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -1922,11 +2072,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +2085,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = user_event_argv_split(args, &argc); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(group, name, &key); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, &key); mutex_unlock(&event_mutex); - if (user) { - if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); - if (!argv) { - ret = -ENOMEM; - goto error; - } + if (argv) + argv_free(argv); - ret = user_fields_match(user, argc, (const char **)argv); - argv_free(argv); - - } else - ret = list_empty(&user->fields); - - if (ret) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); - } else { - ret = -EADDRINUSE; - goto error; - } + if (IS_ERR(user)) + return PTR_ERR(user); + + if (user) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. + */ + kfree(name); return 0; -error: - user_event_put(user, false); - return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1982,7 +2125,13 @@ error: INIT_LIST_HEAD(&user->validators); user->group = group; - user->tracepoint.name = name; + user->reg_name = name; + user->reg_flags = reg_flags; + + ret = user_event_set_tp_name(user); + + if (ret) + goto put_user; ret = user_event_parse_fields(user, args); @@ -1996,11 +2145,14 @@ error: user->call.data = user; user->call.class = &user->class; - user->call.name = name; user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; - user->class.system = group->system_name; + + if (EVENT_MULTI_FORMAT(user->reg_flags)) + user->class.system = group->system_multi_name; + else + user->class.system = group->system_name; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; @@ -2022,8 +2174,6 @@ error: if (ret) goto put_user_lock; - user->reg_flags = reg_flags; - if (user->reg_flags & USER_EVENT_REG_PERSIST) { /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); @@ -2047,30 +2197,43 @@ put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); kfree(user->call.print_fmt); + + /* Caller frees reg_name on error, but not multi-name */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user); return ret; } /* - * Deletes a previously created event if it is no longer being used. + * Deletes previously created events if they are no longer being used. */ static int delete_user_event(struct user_event_group *group, char *name) { - u32 key; - struct user_event *user = find_user_event(group, name, &key); + struct user_event *user; + struct hlist_node *tmp; + u32 key = user_event_key(name); + int ret = -ENOENT; - if (!user) - return -ENOENT; + /* Attempt to delete all event(s) with the name passed in */ + hash_for_each_possible_safe(group->register_table, user, tmp, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; - user_event_put(user, true); + if (!user_event_last_ref(user)) + return -EBUSY; - if (!user_event_last_ref(user)) - return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; - if (!user_event_capable(user->reg_flags)) - return -EPERM; + ret = destroy_user_event(user); - return destroy_user_event(user); + if (ret) + goto out; + } +out: + return ret; } /* @@ -2117,7 +2280,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) * It's possible key.enabled disables after this check, however * we don't mind if a few events are included in this condition. */ - if (likely(atomic_read(&tp->key.enabled) > 0)) { + if (likely(static_key_enabled(&tp->key))) { struct tracepoint_func *probe_func_ptr; user_event_func_t probe_func; struct iov_iter copy; @@ -2628,7 +2791,7 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - seq_printf(m, "%s", EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_TP_NAME(user)); if (status != 0) seq_puts(m, " #"); @@ -2722,7 +2885,7 @@ err: return -ENODEV; } -static int set_max_user_events_sysctl(struct ctl_table *table, int write, +static int set_max_user_events_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2736,7 +2899,7 @@ static int set_max_user_events_sysctl(struct ctl_table *table, int write, return ret; } -static struct ctl_table user_event_sysctls[] = { +static const struct ctl_table user_event_sysctls[] = { { .procname = "user_events_max", .data = &max_user_events, @@ -2744,7 +2907,6 @@ static struct ctl_table user_event_sysctls[] = { .mode = 0644, .proc_handler = set_max_user_events_sysctl, }, - {} }; static int __init trace_events_user_init(void) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 7d2ddbcfa377..985ff98272da 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,6 +4,7 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt +#include <asm/ptrace.h> #include <linux/fprobe.h> #include <linux/module.h> @@ -20,6 +21,7 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 +#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -129,10 +131,10 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf) * from user space. */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { - struct pt_regs *regs = rec; + struct ftrace_regs *fregs = rec; unsigned long val; int ret; @@ -140,17 +142,20 @@ retry: /* 1st stage: get value from context */ switch (code->op) { case FETCH_OP_STACK: - val = regs_get_kernel_stack_nth(regs, code->param); + val = ftrace_regs_get_kernel_stack_nth(fregs, code->param); break; case FETCH_OP_STACKP: - val = kernel_stack_pointer(regs); + val = ftrace_regs_get_stack_pointer(fregs); break; case FETCH_OP_RETVAL: - val = regs_return_value(regs); + val = ftrace_regs_get_return_value(fregs); break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: - val = regs_get_kernel_argument(regs, code->param); + val = ftrace_regs_get_argument(fregs, code->param); + break; + case FETCH_OP_EDATA: + val = *(unsigned long *)((unsigned long)edata + code->offset); break; #endif case FETCH_NOP_SYMBOL: /* Ignore a place holder */ @@ -170,7 +175,7 @@ NOKPROBE_SYMBOL(process_fetch_insn) /* function entry handler */ static nokprobe_inline void __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs, + struct ftrace_regs *fregs, struct trace_event_file *trace_file) { struct fentry_trace_entry_head *entry; @@ -184,37 +189,80 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, fregs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = entry_ip; - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fentry_trace_func(tf, entry_ip, regs, link->file); + __fentry_trace_func(tf, entry_ip, fregs, link->file); } NOKPROBE_SYMBOL(fentry_trace_func); -/* Kretprobe handler */ +static nokprobe_inline +void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs) +{ + struct probe_entry_arg *earg = tp->entry_arg; + unsigned long val = 0; + int i; + + if (!earg) + return; + + for (i = 0; i < earg->size; i++) { + struct fetch_insn *code = &earg->code[i]; + + switch (code->op) { + case FETCH_OP_ARG: + val = ftrace_regs_get_argument(fregs, code->param); + break; + case FETCH_OP_ST_EDATA: + *(unsigned long *)((unsigned long)edata + code->offset) = val; + break; + case FETCH_OP_END: + goto end; + default: + break; + } + } +end: + return; +} + +/* function exit handler */ +static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct ftrace_regs *fregs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (tf->tp.entry_arg) + store_fprobe_entry_data(entry_data, &tf->tp, fregs); + + return 0; +} +NOKPROBE_SYMBOL(trace_fprobe_entry_handler) + static nokprobe_inline void __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, - struct trace_event_file *trace_file) + unsigned long ret_ip, struct ftrace_regs *fregs, + void *entry_data, struct trace_event_file *trace_file) { struct fexit_trace_entry_head *entry; struct trace_event_buffer fbuffer; @@ -227,60 +275,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, fregs, entry_data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs) + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); + __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file); } NOKPROBE_SYMBOL(fexit_trace_func); #ifdef CONFIG_PERF_EVENTS static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fentry_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, fregs, NULL); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return 0; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->ip = entry_ip; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -289,30 +340,34 @@ NOKPROBE_SYMBOL(fentry_perf_func); static void fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs) + unsigned long ret_ip, struct ftrace_regs *fregs, + void *entry_data) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fexit_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; - dsize = __get_data_size(&tf->tp, regs); + dsize = __get_data_size(&tf->tp, fregs, entry_data); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -320,33 +375,34 @@ NOKPROBE_SYMBOL(fexit_perf_func); #endif /* CONFIG_PERF_EVENTS */ static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); int ret = 0; if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fentry_trace_func(tf, entry_ip, regs); + fentry_trace_func(tf, entry_ip, fregs); + #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - ret = fentry_perf_func(tf, entry_ip, regs); + ret = fentry_perf_func(tf, entry_ip, fregs); #endif return ret; } NOKPROBE_SYMBOL(fentry_dispatcher); static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fexit_trace_func(tf, entry_ip, ret_ip, regs); + fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - fexit_perf_func(tf, entry_ip, ret_ip, regs); + fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data); #endif } NOKPROBE_SYMBOL(fexit_dispatcher); @@ -360,6 +416,9 @@ static void free_trace_fprobe(struct trace_fprobe *tf) } } +/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */ +DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T)) + /* * Allocate new trace_probe and initialize it (including fprobe). */ @@ -367,10 +426,10 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, struct tracepoint *tpoint, - int maxactive, + struct module *mod, int nargs, bool is_return) { - struct trace_fprobe *tf; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); @@ -379,7 +438,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->symbol = kstrdup(symbol, GFP_KERNEL); if (!tf->symbol) - goto error; + return ERR_PTR(-ENOMEM); if (is_return) tf->fp.exit_handler = fexit_dispatcher; @@ -387,17 +446,14 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->fp.entry_handler = fentry_dispatcher; tf->tpoint = tpoint; - tf->fp.nr_maxactive = maxactive; + tf->mod = mod; - ret = trace_probe_init(&tf->tp, event, group, false); + ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tf->devent, &trace_fprobe_ops); - return tf; -error: - free_trace_fprobe(tf); - return ERR_PTR(ret); + return_ptr(tf); } static struct trace_fprobe *find_trace_fprobe(const char *event, @@ -654,6 +710,24 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) return trace_probe_unregister_event_call(&tf->tp); } +static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) +{ + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + int ret; + + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); +} + /* Internal register function - just handle fprobe and flags */ static int __register_trace_fprobe(struct trace_fprobe *tf) { @@ -680,18 +754,12 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) tf->fp.flags |= FPROBE_FL_DISABLED; if (trace_fprobe_is_tracepoint(tf)) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; - /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. - */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + + /* This tracepoint is not loaded yet */ + if (tf->tpoint == TRACEPOINT_STUB) + return 0; + + return __regsiter_tracepoint_fprobe(tf); } /* TODO: handle filter, nofilter or symbol list */ @@ -812,14 +880,12 @@ static int register_trace_fprobe(struct trace_fprobe *tf) struct trace_fprobe *old_tf; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); - if (old_tf) { - ret = append_trace_fprobe(tf, old_tf); - goto end; - } + if (old_tf) + return append_trace_fprobe(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -829,7 +895,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register fprobe */ @@ -839,29 +905,106 @@ static int register_trace_fprobe(struct trace_fprobe *tf) else dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); -end: - mutex_unlock(&event_mutex); return ret; } +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; + struct module *mod; +}; + +static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mod, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { + data->tpoint = tp; + if (!data->mod) + data->mod = mod; + } +} + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +/* Find a tracepoint from kernel and module. */ +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + .mod = NULL, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + if (!data.tpoint && IS_ENABLED(CONFIG_MODULES)) { + for_each_module_tracepoint(__find_tracepoint_module_cb, &data); + *tp_mod = data.mod; + } + + return data.tpoint; +} + #ifdef CONFIG_MODULES +static void reenable_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_probe *tp = &tf->tp; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + __enable_trace_fprobe(tf); + } +} + +/* Find a tracepoint from specified module. */ +static struct tracepoint *find_tracepoint_in_module(struct module *mod, + const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + .mod = mod, + }; + + for_each_tracepoint_in_module(mod, __find_tracepoint_module_cb, &data); + return data.tpoint; +} + static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint *tpoint; struct trace_fprobe *tf; struct dyn_event *pos; - if (val != MODULE_STATE_GOING) + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (tp_mod->mod == tf->mod) { - tracepoint_probe_unregister(tf->tpoint, + if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { + tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); + if (tpoint) { + tf->tpoint = tpoint; + tf->mod = tp_mod->mod; + if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && + trace_probe_is_enabled(&tf->tp)) + reenable_trace_fprobe(tf); + } + } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { + unregister_fprobe(&tf->fp); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; + tf->tpoint = TRACEPOINT_STUB; + tf->mod = NULL; + } } } mutex_unlock(&event_mutex); @@ -874,30 +1017,6 @@ static struct notifier_block tracepoint_module_nb = { }; #endif /* CONFIG_MODULES */ -struct __find_tracepoint_cb_data { - const char *tp_name; - struct tracepoint *tpoint; -}; - -static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) -{ - struct __find_tracepoint_cb_data *data = priv; - - if (!data->tpoint && !strcmp(data->tp_name, tp->name)) - data->tpoint = tp; -} - -static struct tracepoint *find_tracepoint(const char *tp_name) -{ - struct __find_tracepoint_cb_data data = { - .tp_name = tp_name, - }; - - for_each_kernel_tracepoint(__find_tracepoint_cb, &data); - - return data.tpoint; -} - static int parse_symbol_and_return(int argc, const char *argv[], char **symbol, bool *is_return, bool is_tracepoint) @@ -923,6 +1042,19 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (*is_return) return 0; + if (is_tracepoint) { + tmp = *symbol; + while (*tmp && (isalnum(*tmp) || *tmp == '_')) + tmp++; + if (*tmp) { + /* find a wrong character. */ + trace_probe_log_err(tmp - *symbol, BAD_TP_NAME); + kfree(*symbol); + *symbol = NULL; + return -EINVAL; + } + } + /* If there is $retval, this should be a return fprobe. */ for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); @@ -930,6 +1062,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (is_tracepoint) { trace_probe_log_set_index(i); trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + kfree(*symbol); + *symbol = NULL; return -EINVAL; } *is_return = true; @@ -939,7 +1073,10 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -static int __trace_fprobe_create(int argc, const char *argv[]) +DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) + +static int trace_fprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -965,22 +1102,20 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_fprobe *tf = NULL; - int i, len, new_argc = 0, ret = 0; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; + int i, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL; + char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; - const char **new_argv = NULL; - int maxactive = 0; + const char **new_argv __free(kfree) = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf __free(kfree) = NULL; bool is_tracepoint = false; + struct module *tp_mod __free(module_put) = NULL; struct tracepoint *tpoint = NULL; - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -990,35 +1125,13 @@ static int __trace_fprobe_create(int argc, const char *argv[]) group = TRACEPOINT_EVENT_SYSTEM; } - trace_probe_log_init("trace_fprobe", argc, argv); - - event = strchr(&argv[0][1], ':'); - if (event) - event++; - - if (isdigit(argv[0][1])) { - if (event) - len = event - &argv[0][1] - 1; - else - len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) { - trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - memcpy(buf, &argv[0][1], len); - buf[len] = '\0'; - ret = kstrtouint(buf, 0, &maxactive); - if (ret || !maxactive) { + if (argv[0][1] != '\0') { + if (argv[0][1] != ':') { + trace_probe_log_set_index(0); trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - /* fprobe rethook instances are iterated over via a list. The - * maximum should stay reasonable. - */ - if (maxactive > RETHOOK_MAXACTIVE_MAX) { - trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } + event = &argv[0][2]; } trace_probe_log_set_index(1); @@ -1026,20 +1139,14 @@ static int __trace_fprobe_create(int argc, const char *argv[]) /* a symbol(or tracepoint) must be specified */ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); if (ret < 0) - goto parse_error; - - if (!is_return && maxactive) { - trace_probe_log_set_index(0); - trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; - } + return -EINVAL; trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } if (!event) { @@ -1055,64 +1162,83 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; else - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; if (is_tracepoint) { - ctx.flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol); - if (!tpoint) { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - goto parse_error; + ctx->flags |= TPARG_FL_TPOINT; + tpoint = find_tracepoint(symbol, &tp_mod); + /* lock module until register this tprobe. */ + if (tp_mod && !try_module_get(tp_mod)) { + tpoint = NULL; + tp_mod = NULL; } - ctx.funcname = kallsyms_lookup( + if (tpoint) { + ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, NULL, NULL, NULL, sbuf); + } else if (IS_ENABLED(CONFIG_MODULES)) { + /* This *may* be loaded afterwards */ + tpoint = TRACEPOINT_STUB; + ctx->funcname = symbol; + } else { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + return -EINVAL; + } } else - ctx.funcname = symbol; + ctx->funcname = symbol; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); - if (IS_ERR(new_argv)) { - ret = PTR_ERR(new_argv); - new_argv = NULL; - goto out; - } + abuf, MAX_BTF_ARGS_LEN, ctx); + if (IS_ERR(new_argv)) + return PTR_ERR(new_argv); if (new_argv) { argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) + return -E2BIG; + + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, + tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tf is not allocated */ + return ret; } - if (is_tracepoint) - tf->mod = __module_text_address( - (unsigned long)tf->tpoint->probestub); - /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ + } + + if (is_return && tf->tp.entry_arg) { + tf->fp.entry_handler = trace_fprobe_entry_handler; + tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_EARGS); + return -E2BIG; + } } ret = traceprobe_set_print_fmt(&tf->tp, is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); if (ret < 0) - goto error; + return ret; ret = register_trace_fprobe(tf); if (ret) { @@ -1123,26 +1249,32 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return -EINVAL; } -out: + /* 'tf' is successfully registered. To avoid freeing, assign NULL. */ + tf = NULL; + + return 0; +} + +static int trace_fprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; + int ret; + + trace_probe_log_init("trace_fprobe", argc, argv); + ret = trace_fprobe_create_internal(argc, argv, &ctx); traceprobe_finish_parse(&ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_fprobe(tf); - goto out; } static int trace_fprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_fprobe_create); + return trace_probe_create(raw_command, trace_fprobe_create_cb); } static int trace_fprobe_release(struct dyn_event *ev) @@ -1164,8 +1296,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) seq_putc(m, 't'); else seq_putc(m, 'f'); - if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) - seq_printf(m, "%d", tf->fp.nr_maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), trace_probe_name(&tf->tp)); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9f1bfbe105e8..df56f9b76010 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -80,6 +80,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr) int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent) { + int ret; /* * The top level array uses the "global_ops", and the files are * created on boot up. @@ -90,6 +91,12 @@ int ftrace_create_function_files(struct trace_array *tr, if (!tr->ops) return -EINVAL; + ret = allocate_fgraph_ops(tr, tr->ops); + if (ret) { + kfree(tr->ops); + return ret; + } + ftrace_create_filter_files(tr->ops, parent); return 0; @@ -99,6 +106,7 @@ void ftrace_destroy_function_files(struct trace_array *tr) { ftrace_destroy_filter_files(tr->ops); ftrace_free_ftrace_ops(tr); + free_fgraph_ops(tr); } static ftrace_func_t select_trace_function(u32 flags_val) @@ -168,6 +176,28 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } +/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + unsigned long true_parent_ip; + int idx = 0; + + true_parent_ip = parent_ip; + if (unlikely(parent_ip == (unsigned long)&return_to_handler) && fregs) + true_parent_ip = ftrace_graph_ret_addr(current, &idx, parent_ip, + (unsigned long *)ftrace_regs_get_stack_pointer(fregs)); + return true_parent_ip; +} +#else +static __always_inline unsigned long +function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) +{ + return parent_ip; +} +#endif + static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) @@ -176,7 +206,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -185,10 +214,11 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - trace_ctx = tracing_gen_ctx(); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + trace_ctx = tracing_gen_ctx_dec(); + + data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) trace_function(tr, ip, parent_ip, trace_ctx); @@ -223,6 +253,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, long disabled; int cpu; unsigned int trace_ctx; + int skip = STACK_SKIP; if (unlikely(!tr->function_enabled)) return; @@ -232,6 +263,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); @@ -239,7 +271,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); trace_function(tr, ip, parent_ip, trace_ctx); - __trace_stack(tr, trace_ctx, STACK_SKIP); +#ifdef CONFIG_UNWINDER_FRAME_POINTER + if (ftrace_pids_enabled(op)) + skip++; +#endif + __trace_stack(tr, trace_ctx, skip); } atomic_dec(&data->disabled); @@ -285,9 +321,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -296,8 +330,8 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); + data = this_cpu_ptr(tr->array_buffer.data); if (atomic_read(&data->disabled)) goto out; @@ -308,12 +342,11 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * TODO: think about a solution that is better than just hoping to be * lucky. */ - last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + last_info = this_cpu_ptr(tr->last_func_repeats); if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); trace_function(tr, ip, parent_ip, trace_ctx); @@ -343,6 +376,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, * recursive protection is performed. */ local_irq_save(flags); + parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c35fbaab2a47..136c750b0b4d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -31,7 +31,10 @@ struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ - struct ftrace_graph_ent_entry ent; + union { + struct ftrace_graph_ent_entry ent; + struct fgraph_retaddr_ent_entry rent; + } ent; struct ftrace_graph_ret_entry ret; int failed; int cpu; @@ -64,6 +67,10 @@ static struct tracer_opt trace_opts[] = { /* Display function return value in hexadecimal format ? */ { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, #endif +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + /* Display function return address ? */ + { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -83,7 +90,10 @@ static struct tracer_flags tracer_flags = { .opts = trace_opts }; -static struct trace_array *graph_array; +static bool tracer_flags_is_set(u32 flags) +{ + return (tracer_flags.val & flags) == flags; +} /* * DURATION column is being also used to display IRQ signs, @@ -104,7 +114,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned int trace_ctx) { - struct trace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; @@ -115,12 +124,43 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR +int __trace_graph_retaddr_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx, + unsigned long retaddr) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct fgraph_retaddr_ent_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RETADDR_ENT, + sizeof(*entry), trace_ctx); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent.func = trace->func; + entry->graph_ent.depth = trace->depth; + entry->graph_ent.retaddr = retaddr; + trace_buffer_unlock_commit_nostack(buffer, event); + + return 1; +} +#else +int __trace_graph_retaddr_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx, + unsigned long retaddr) +{ + return 1; +} +#endif + static inline int ftrace_graph_ignore_irqs(void) { if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) @@ -129,17 +169,25 @@ static inline int ftrace_graph_ignore_irqs(void) return in_hardirq(); } -int trace_graph_entry(struct ftrace_graph_ent *trace) +struct fgraph_times { + unsigned long long calltime; + unsigned long long sleeptime; /* may be optional! */ +}; + +int trace_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct trace_array *tr = graph_array; + unsigned long *task_var = fgraph_get_task_var(gops); + struct trace_array *tr = gops->private; struct trace_array_cpu *data; - unsigned long flags; + struct fgraph_times *ftimes; unsigned int trace_ctx; long disabled; - int ret; + int ret = 0; int cpu; - if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) + if (*task_var & TRACE_GRAPH_NOTRACE) return 0; /* @@ -150,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. @@ -161,12 +209,25 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (!ftrace_trace_task(tr)) return 0; - if (ftrace_graph_ignore_func(trace)) + if (ftrace_graph_ignore_func(gops, trace)) return 0; if (ftrace_graph_ignore_irqs()) return 0; + if (fgraph_sleep_time) { + /* Only need to record the calltime */ + ftimes = fgraph_reserve_data(gops->idx, sizeof(ftimes->calltime)); + } else { + ftimes = fgraph_reserve_data(gops->idx, sizeof(*ftimes)); + if (ftimes) + ftimes->sleeptime = current->ftrace_sleeptime; + } + if (!ftimes) + return 0; + + ftimes->calltime = trace_clock_local(); + /* * Stop here if tracing_threshold is set. We only write function return * events to the ring buffer. @@ -174,19 +235,21 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (tracing_thresh) return 1; - local_irq_save(flags); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - ret = __trace_graph_entry(tr, trace, trace_ctx); - } else { - ret = 0; + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); + } else { + ret = __trace_graph_entry(tr, trace, trace_ctx); + } } - - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); return ret; } @@ -203,12 +266,10 @@ __trace_graph_function(struct trace_array *tr, struct ftrace_graph_ret ret = { .func = ip, .depth = 0, - .calltime = time, - .rettime = time, }; __trace_graph_entry(tr, &ent, trace_ctx); - __trace_graph_return(tr, &ret, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx, time, time); } void @@ -220,10 +281,10 @@ trace_graph_function(struct trace_array *tr, } void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned int trace_ctx) + struct ftrace_graph_ret *trace, + unsigned int trace_ctx, + u64 calltime, u64 rettime) { - struct trace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ret_entry *entry; @@ -234,82 +295,140 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + entry->calltime = calltime; + entry->rettime = rettime; + trace_buffer_unlock_commit_nostack(buffer, event); +} + +static void handle_nosleeptime(struct ftrace_graph_ret *trace, + struct fgraph_times *ftimes, + int size) +{ + if (fgraph_sleep_time || size < sizeof(*ftimes)) + return; + + ftimes->calltime += current->ftrace_sleeptime - ftimes->sleeptime; } -void trace_graph_return(struct ftrace_graph_ret *trace) +void trace_graph_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, struct ftrace_regs *fregs) { - struct trace_array *tr = graph_array; + unsigned long *task_var = fgraph_get_task_var(gops); + struct trace_array *tr = gops->private; struct trace_array_cpu *data; - unsigned long flags; + struct fgraph_times *ftimes; unsigned int trace_ctx; + u64 calltime, rettime; long disabled; + int size; int cpu; - ftrace_graph_addr_finish(trace); + rettime = trace_clock_local(); - if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { - trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + ftrace_graph_addr_finish(gops, trace); + + if (*task_var & TRACE_GRAPH_NOTRACE) { + *task_var &= ~TRACE_GRAPH_NOTRACE; return; } - local_irq_save(flags); + ftimes = fgraph_retrieve_data(gops->idx, &size); + if (!ftimes) + return; + + handle_nosleeptime(trace, ftimes, size); + + calltime = ftimes->calltime; + + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } -void set_graph_array(struct trace_array *tr) +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - graph_array = tr; + struct fgraph_times *ftimes; + int size; - /* Make graph_array visible before we start tracing */ - - smp_mb(); -} - -static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) -{ - ftrace_graph_addr_finish(trace); + ftrace_graph_addr_finish(gops, trace); if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); return; } + ftimes = fgraph_retrieve_data(gops->idx, &size); + if (!ftimes) + return; + + handle_nosleeptime(trace, ftimes, size); + if (tracing_thresh && - (trace->rettime - trace->calltime < tracing_thresh)) + (trace_clock_local() - ftimes->calltime < tracing_thresh)) return; else - trace_graph_return(trace); + trace_graph_return(trace, gops, fregs); } -static struct fgraph_ops funcgraph_thresh_ops = { - .entryfunc = &trace_graph_entry, - .retfunc = &trace_graph_thresh_return, -}; - static struct fgraph_ops funcgraph_ops = { .entryfunc = &trace_graph_entry, .retfunc = &trace_graph_return, }; +int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops) +{ + struct fgraph_ops *gops; + + gops = kzalloc(sizeof(*gops), GFP_KERNEL); + if (!gops) + return -ENOMEM; + + gops->entryfunc = &trace_graph_entry; + gops->retfunc = &trace_graph_return; + + tr->gops = gops; + gops->private = tr; + + fgraph_init_ops(&gops->ops, ops); + + return 0; +} + +void free_fgraph_ops(struct trace_array *tr) +{ + kfree(tr->gops); +} + +__init void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops) +{ + tr->gops = &funcgraph_ops; + funcgraph_ops.private = tr; + fgraph_init_ops(&tr->gops->ops, ops); +} + static int graph_trace_init(struct trace_array *tr) { int ret; - set_graph_array(tr); + tr->gops->entryfunc = trace_graph_entry; + if (tracing_thresh) - ret = register_ftrace_graph(&funcgraph_thresh_ops); + tr->gops->retfunc = trace_graph_thresh_return; else - ret = register_ftrace_graph(&funcgraph_ops); + tr->gops->retfunc = trace_graph_return; + + /* Make gops functions are visible before we start tracing */ + smp_mb(); + + ret = register_ftrace_graph(tr->gops); if (ret) return ret; tracing_start_cmdline_record(); @@ -320,10 +439,7 @@ static int graph_trace_init(struct trace_array *tr) static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); - if (tracing_thresh) - unregister_ftrace_graph(&funcgraph_thresh_ops); - else - unregister_ftrace_graph(&funcgraph_ops); + unregister_ftrace_graph(tr->gops); } static int graph_trace_update_thresh(struct trace_array *tr) @@ -434,7 +550,7 @@ get_return_for_leaf(struct trace_iterator *iter, * then we just reuse the data from before. */ if (data && data->failed) { - curr = &data->ent; + curr = &data->ent.ent; next = &data->ret; } else { @@ -464,7 +580,10 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - data->ent = *curr; + if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) + data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr; + else + data->ent.ent = *curr; /* * If the next event is not a return type, then * we only care about what type it is. Otherwise we can @@ -521,6 +640,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; + addr += iter->tr->text_delta; + if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) return; @@ -626,52 +747,96 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, } #ifdef CONFIG_FUNCTION_GRAPH_RETVAL - #define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL +#else +#define __TRACE_GRAPH_PRINT_RETVAL 0 +#endif -static void print_graph_retval(struct trace_seq *s, unsigned long retval, - bool leaf, void *func, bool hex_format) +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR +#define __TRACE_GRAPH_PRINT_RETADDR TRACE_GRAPH_PRINT_RETADDR +static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_entry *entry, + u32 trace_flags, bool comment) +{ + if (comment) + trace_seq_puts(s, " /*"); + + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, entry->graph_ent.retaddr, trace_flags | TRACE_ITER_SYM_OFFSET); + + if (comment) + trace_seq_puts(s, " */"); +} +#else +#define __TRACE_GRAPH_PRINT_RETADDR 0 +#define print_graph_retaddr(_seq, _entry, _tflags, _comment) do { } while (0) +#endif + +#if defined(CONFIG_FUNCTION_GRAPH_RETVAL) || defined(CONFIG_FUNCTION_GRAPH_RETADDR) + +static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret *graph_ret, void *func, + u32 opt_flags, u32 trace_flags) { unsigned long err_code = 0; + unsigned long retval = 0; + bool print_retaddr = false; + bool print_retval = false; + bool hex_format = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL_HEX); + +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + retval = graph_ret->retval; + print_retval = !!(opt_flags & TRACE_GRAPH_PRINT_RETVAL); +#endif - if (retval == 0 || hex_format) - goto done; +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + print_retaddr = !!(opt_flags & TRACE_GRAPH_PRINT_RETADDR); +#endif - /* Check if the return value matches the negative format */ - if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && - (((u64)retval) >> 32) == 0) { - /* sign extension */ - err_code = (unsigned long)(s32)retval; - } else { - err_code = retval; + if (print_retval && retval && !hex_format) { + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + err_code = sign_extend64(retval, 31); + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; } - if (!IS_ERR_VALUE(err_code)) - err_code = 0; + if (entry) { + if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) + print_retaddr = false; -done: - if (leaf) { - if (hex_format || (err_code == 0)) - trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", - func, retval); + trace_seq_printf(s, "%ps();", func); + if (print_retval || print_retaddr) + trace_seq_puts(s, " /*"); else - trace_seq_printf(s, "%ps(); /* = %ld */\n", - func, err_code); + trace_seq_putc(s, '\n'); } else { + print_retaddr = false; + trace_seq_printf(s, "} /* %ps", func); + } + + if (print_retaddr) + print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, + trace_flags, false); + + if (print_retval) { if (hex_format || (err_code == 0)) - trace_seq_printf(s, "} /* %ps = 0x%lx */\n", - func, retval); + trace_seq_printf(s, " ret=0x%lx", retval); else - trace_seq_printf(s, "} /* %ps = %ld */\n", - func, err_code); + trace_seq_printf(s, " ret=%ld", err_code); } + + if (!entry || print_retval || print_retaddr) + trace_seq_puts(s, " */\n"); } #else -#define __TRACE_GRAPH_PRINT_RETVAL 0 - -#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) #endif @@ -687,12 +852,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; + unsigned long func; int cpu = iter->cpu; int i; graph_ret = &ret_entry->ret; call = &entry->graph_ent; - duration = graph_ret->rettime - graph_ret->calltime; + duration = ret_entry->rettime - ret_entry->calltime; + + func = call->func + iter->tr->text_delta; if (data) { struct fgraph_cpu_data *cpu_data; @@ -720,14 +888,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_putc(s, ' '); /* - * Write out the function return value if the option function-retval is - * enabled. + * Write out the function return value or return address */ - if (flags & __TRACE_GRAPH_PRINT_RETVAL) - print_graph_retval(s, graph_ret->retval, true, (void *)call->func, - !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); - else - trace_seq_printf(s, "%ps();\n", (void *)call->func); + if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { + print_graph_retval(s, entry, graph_ret, + (void *)graph_ret->func + iter->tr->text_delta, + flags, tr->trace_flags); + } else { + trace_seq_printf(s, "%ps();\n", (void *)func); + } print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -743,6 +912,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; + unsigned long func; int i; if (data) { @@ -765,7 +935,14 @@ print_graph_entry_nested(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); - trace_seq_printf(s, "%ps() {\n", (void *)call->func); + func = call->func + iter->tr->text_delta; + + trace_seq_printf(s, "%ps() {", (void *)func); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && + entry->ent.type == TRACE_GRAPH_RETADDR_ENT) + print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, + tr->trace_flags, true); + trace_seq_putc(s, '\n'); if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; @@ -840,6 +1017,8 @@ check_irq_entry(struct trace_iterator *iter, u32 flags, int *depth_irq; struct fgraph_data *data = iter->private; + addr += iter->tr->text_delta; + /* * If we are either displaying irqs, or we got called as * a graph event and private data does not exist, @@ -960,18 +1139,24 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, } static enum print_line_t -print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, +print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long long duration = trace->rettime - trace->calltime; + struct ftrace_graph_ret *trace = &retentry->ret; + u64 calltime = retentry->calltime; + u64 rettime = retentry->rettime; + unsigned long long duration = rettime - calltime; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; + unsigned long func; pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; int i; + func = trace->func + iter->tr->text_delta; + if (check_irq_return(iter, flags, trace->depth)) return TRACE_TYPE_HANDLED; @@ -1007,11 +1192,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* * Always write out the function name and its return value if the - * function-retval option is enabled. + * funcgraph-retval option is enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, trace->retval, false, (void *)trace->func, - !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); } else { /* * If the return function does not have a matching entry, @@ -1023,7 +1207,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) trace_seq_puts(s, "}\n"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + trace_seq_printf(s, "} /* %ps */\n", (void *)func); } /* Overrun */ @@ -1126,7 +1310,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) * to print out the missing entry which would never go out. */ if (data && data->failed) { - field = &data->ent; + field = &data->ent.ent; iter->cpu = data->cpu; ret = print_graph_entry(field, s, iter, flags); if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { @@ -1150,10 +1334,20 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) saved = *field; return print_graph_entry(&saved, s, iter, flags); } +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + case TRACE_GRAPH_RETADDR_ENT: { + struct fgraph_retaddr_ent_entry saved; + struct fgraph_retaddr_ent_entry *rfield; + + trace_assign_type(rfield, entry); + saved = *rfield; + return print_graph_entry((struct ftrace_graph_ent_entry *)&saved, s, iter, flags); + } +#endif case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter, flags); + return print_graph_return(field, s, entry, iter, flags); } case TRACE_STACK: case TRACE_FN: @@ -1344,6 +1538,13 @@ static struct trace_event graph_trace_entry_event = { .funcs = &graph_functions, }; +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR +static struct trace_event graph_trace_retaddr_entry_event = { + .type = TRACE_GRAPH_RETADDR_ENT, + .funcs = &graph_functions, +}; +#endif + static struct trace_event graph_trace_ret_event = { .type = TRACE_GRAPH_RET, .funcs = &graph_functions @@ -1362,6 +1563,7 @@ static struct tracer graph_trace __tracer_data = { .print_header = print_graph_headers, .flags = &tracer_flags, .set_flag = func_graph_set_flag, + .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif @@ -1429,6 +1631,13 @@ static __init int init_graph_trace(void) return 1; } +#ifdef CONFIG_FUNCTION_GRAPH_RETADDR + if (!register_trace_event(&graph_trace_retaddr_entry_event)) { + pr_warn("Warning: could not register graph trace retaddr events\n"); + return 1; + } +#endif + if (!register_trace_event(&graph_trace_ret_event)) { pr_warn("Warning: could not register graph trace events\n"); return 1; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b791524a6536..b65353ec2837 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -130,7 +130,6 @@ static bool hwlat_busy; static void trace_hwlat_sample(struct hwlat_sample *sample) { struct trace_array *tr = hwlat_trace; - struct trace_event_call *call = &event_hwlat; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct hwlat_entry *entry; @@ -148,8 +147,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->nmi_count = sample->nmi_count; entry->count = sample->count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* Macros to encapsulate the time capturing infrastructure */ @@ -520,6 +518,8 @@ static void hwlat_hotplug_workfn(struct work_struct *dummy) if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) goto out_unlock; + if (!cpu_online(cpu)) + goto out_unlock; if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) goto out_unlock; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ba37f768e2f2..7294ad676379 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -175,15 +175,18 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) return start_irqsoff_tracer(irqsoff_trace, set); } -static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; int ret; - if (ftrace_graph_ignore_func(trace)) + if (ftrace_graph_ignore_func(gops, trace)) return 0; /* * Do not trace a function if it's filtered by set_graph_notrace. @@ -198,6 +201,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) if (!func_prolog_dec(tr, &data, &flags)) return 0; + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (!calltime) + return 0; + + *calltime = trace_clock_local(); + trace_ctx = tracing_gen_ctx_flags(flags); ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); @@ -205,20 +214,30 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +static void irqsoff_graph_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; + u64 rettime; + int size; - ftrace_graph_addr_finish(trace); + ftrace_graph_addr_finish(gops, trace); if (!func_prolog_dec(tr, &data, &flags)) return; + rettime = trace_clock_local(); + calltime = fgraph_retrieve_data(gops->idx, &size); + if (!calltime) + return; + trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); atomic_dec(&data->disabled); } diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 59857a1ee44c..1e72d20b3c2f 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -96,22 +96,19 @@ static int kdb_ftdump(int argc, const char **argv) { int skip_entries = 0; long cpu_file; - char *cp; + int err; int cnt; int cpu; if (argc > 2) return KDB_ARGCOUNT; - if (argc) { - skip_entries = simple_strtol(argv[1], &cp, 0); - if (*cp) - skip_entries = 0; - } + if (argc && kstrtoint(argv[1], 0, &skip_entries)) + return KDB_BADINT; if (argc == 2) { - cpu_file = simple_strtol(argv[2], &cp, 0); - if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + err = kstrtol(argv[2], 0, &cpu_file); + if (err || cpu_file >= NR_CPUS || cpu_file < 0 || !cpu_online(cpu_file)) return KDB_BADINT; } else { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c4c6e0e0068b..d8d5f18a141a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/bpf-cgroup.h> +#include <linux/cleanup.h> #include <linux/security.h> #include <linux/module.h> #include <linux/uaccess.h> @@ -111,6 +112,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } +#ifdef CONFIG_MODULES static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) { char *p; @@ -129,6 +131,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) return ret; } +#else +static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) +{ + return false; +} +#endif static bool trace_kprobe_is_busy(struct dyn_event *ev) { @@ -250,6 +258,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk) } } +DEFINE_FREE(free_trace_kprobe, struct trace_kprobe *, + if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T)) + /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -261,7 +272,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, int maxactive, int nargs, bool is_return) { - struct trace_kprobe *tk; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int ret = -ENOMEM; tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); @@ -270,12 +281,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->nhit = alloc_percpu(unsigned long); if (!tk->nhit) - goto error; + return ERR_PTR(ret); if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) - goto error; + return ERR_PTR(ret); tk->rp.kp.symbol_name = tk->symbol; tk->rp.kp.offset = offs; } else @@ -290,15 +301,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, INIT_HLIST_NODE(&tk->rp.kp.hlist); INIT_LIST_HEAD(&tk->rp.kp.list); - ret = trace_probe_init(&tk->tp, event, group, false); + ret = trace_probe_init(&tk->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tk->devent, &trace_kprobe_ops); - return tk; -error: - free_trace_kprobe(tk); - return ERR_PTR(ret); + return_ptr(tk); } static struct trace_kprobe *find_trace_kprobe(const char *event, @@ -627,7 +635,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) struct trace_kprobe *old_tk; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); @@ -635,11 +643,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_kprobe(tk, old_tk); + return -EEXIST; } - goto end; + return append_trace_kprobe(tk, old_tk); } /* Register new event */ @@ -650,7 +656,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register k*probe */ @@ -665,8 +671,22 @@ static int register_trace_kprobe(struct trace_kprobe *tk) else dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); -end: - mutex_unlock(&event_mutex); + return ret; +} + +#ifdef CONFIG_MODULES +static int validate_module_probe_symbol(const char *modname, const char *symbol); + +static int register_module_trace_kprobe(struct module *mod, struct trace_kprobe *tk) +{ + const char *p; + int ret = 0; + + p = strchr(trace_kprobe_symbol(tk), ':'); + if (p) + ret = validate_module_probe_symbol(module_name(mod), p + 1); + if (!ret) + ret = __register_trace_kprobe(tk); return ret; } @@ -683,27 +703,36 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* Update probes on coming module */ - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ __unregister_trace_kprobe(tk); - ret = __register_trace_kprobe(tk); + ret = register_module_trace_kprobe(mod, tk); if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), module_name(mod), ret); } } - mutex_unlock(&event_mutex); return NOTIFY_DONE; } static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, - .priority = 1 /* Invoked after kprobe module callback */ + .priority = 2 /* Invoked after kprobe and jump_label module callback */ }; +static int trace_kprobe_register_module_notifier(void) +{ + return register_module_notifier(&trace_kprobe_module_nb); +} +#else +static int trace_kprobe_register_module_notifier(void) +{ + return 0; +} +#endif /* CONFIG_MODULES */ static int count_symbols(void *data, unsigned long unused) { @@ -729,18 +758,86 @@ static int count_mod_symbols(void *data, const char *name, unsigned long unused) return 0; } -static unsigned int number_of_same_symbols(char *func_name) +static unsigned int number_of_same_symbols(const char *mod, const char *func_name) { struct sym_count_ctx ctx = { .count = 0, .name = func_name }; - kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + if (!mod) + kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); - module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx); return ctx.count; } -static int __trace_kprobe_create(int argc, const char *argv[]) +static int validate_module_probe_symbol(const char *modname, const char *symbol) +{ + unsigned int count = number_of_same_symbols(modname, symbol); + + if (count > 1) { + /* + * Users should use ADDR to remove the ambiguity of + * using KSYM only. + */ + return -EADDRNOTAVAIL; + } else if (count == 0) { + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + return -ENOENT; + } + return 0; +} + +#ifdef CONFIG_MODULES +/* Return NULL if the module is not loaded or under unloading. */ +static struct module *try_module_get_by_name(const char *name) +{ + struct module *mod; + + rcu_read_lock_sched(); + mod = find_module(name); + if (mod && !try_module_get(mod)) + mod = NULL; + rcu_read_unlock_sched(); + + return mod; +} +#else +#define try_module_get_by_name(name) (NULL) +#endif + +static int validate_probe_symbol(char *symbol) +{ + struct module *mod = NULL; + char *modname = NULL, *p; + int ret = 0; + + p = strchr(symbol, ':'); + if (p) { + modname = symbol; + symbol = p + 1; + *p = '\0'; + mod = try_module_get_by_name(modname); + if (!mod) + goto out; + } + + ret = validate_module_probe_symbol(modname, symbol); +out: + if (p) + *p = ':'; + if (mod) + module_put(mod); + return ret; +} + +static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, + struct pt_regs *regs); + +static int trace_kprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -766,11 +863,12 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk = NULL; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int i, len, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL, *tmp = NULL; - const char **new_argv = NULL; + char *symbol __free(kfree) = NULL; + char *tmp = NULL; + const char **new_argv __free(kfree) = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; enum probe_print_type ptype; int maxactive = 0; @@ -779,7 +877,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + char *dbuf __free(kfree) = NULL; switch (argv[0][0]) { case 'r': @@ -793,8 +891,6 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; - trace_probe_log_init("trace_kprobe", argc, argv); - event = strchr(&argv[0][1], ':'); if (event) event++; @@ -802,7 +898,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (isdigit(argv[0][1])) { if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; + return -EINVAL; } if (event) len = event - &argv[0][1] - 1; @@ -810,21 +906,21 @@ static int __trace_kprobe_create(int argc, const char *argv[]) len = strlen(&argv[0][1]); if (len > MAX_EVENT_NAME_LEN - 1) { trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; + return -EINVAL; } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; + return -EINVAL; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. */ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } } @@ -833,10 +929,9 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) { - ret = -ECANCELED; - goto error; - } + if (strchr(argv[1], '/') && strchr(argv[1], ':')) + return -ECANCELED; + /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -849,7 +944,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) is_return = true; } else { trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); - goto parse_error; + return -EINVAL; } } @@ -857,42 +952,25 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { trace_probe_log_err(0, BAD_PROBE_ADDR); - goto parse_error; + return -EINVAL; + } + ret = validate_probe_symbol(symbol); + if (ret) { + if (ret == -EADDRNOTAVAIL) + trace_probe_log_err(0, NON_UNIQ_SYMBOL); + else + trace_probe_log_err(0, BAD_PROBE_ADDR); + return -EINVAL; } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); if (ret == 0 && !is_return) - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); - goto parse_error; - } - } - - if (symbol && !strchr(symbol, ':')) { - unsigned int count; - - count = number_of_same_symbols(symbol); - if (count > 1) { - /* - * Users should use ADDR to remove the ambiguity of - * using KSYM only. - */ - trace_probe_log_err(0, NON_UNIQ_SYMBOL); - ret = -EADDRNOTAVAIL; - - goto error; - } else if (count == 0) { - /* - * We can return ENOENT earlier than when register the - * kprobe. - */ - trace_probe_log_err(0, BAD_PROBE_ADDR); - ret = -ENOENT; - - goto error; + return -EINVAL; } } @@ -901,7 +979,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return ret; } if (!event) { @@ -917,18 +995,24 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } argc -= 2; argv += 2; - ctx.funcname = symbol; + ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); + abuf, MAX_BTF_ARGS_LEN, ctx); if (IS_ERR(new_argv)) { ret = PTR_ERR(new_argv); new_argv = NULL; - goto out; + return ret; } if (new_argv) { argc = new_argc; argv = new_argv; } + if (argc > MAX_TRACE_ARGS) + return -E2BIG; + + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + return ret; /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, @@ -937,22 +1021,27 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tk is not allocated */ + return ret; /* We know tk is not allocated */ } /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ + } + /* entry handler for kretprobe */ + if (is_return && tk->tp.entry_arg) { + tk->rp.entry_handler = trace_kprobe_entry_handler; + tk->rp.data_size = traceprobe_get_entry_data_size(&tk->tp); } ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_kprobe(tk); if (ret) { @@ -963,26 +1052,34 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return ret; } + /* + * Here, 'tk' has been registered to the list successfully, + * so we don't need to free it. + */ + tk = NULL; + + return 0; +} + +static int trace_kprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + int ret; + + trace_probe_log_init("trace_kprobe", argc, argv); + + ret = trace_kprobe_create_internal(argc, argv, &ctx); -out: traceprobe_finish_parse(&ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_kprobe(tk); - goto out; } static int trace_kprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_kprobe_create); + return trace_probe_create(raw_command, trace_kprobe_create_cb); } static int create_or_delete_trace_kprobe(const char *raw_command) @@ -1303,8 +1400,8 @@ static const struct file_operations kprobe_profile_ops = { /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { struct pt_regs *regs = rec; unsigned long val; @@ -1329,6 +1426,9 @@ retry: case FETCH_OP_ARG: val = regs_get_kernel_argument(regs, code->param); break; + case FETCH_OP_EDATA: + val = *(unsigned long *)((unsigned long)edata + code->offset); + break; #endif case FETCH_NOP_SYMBOL: /* Ignore a place holder */ code++; @@ -1359,7 +1459,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tk->tp.size + dsize); @@ -1368,7 +1468,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, fbuffer.regs = regs; entry->ip = (unsigned long)tk->rp.kp.addr; - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } @@ -1384,6 +1484,31 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) NOKPROBE_SYMBOL(kprobe_trace_func); /* Kretprobe handler */ + +static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct kretprobe *rp = get_kretprobe(ri); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return -ENOENT; + + tk = container_of(rp, struct trace_kprobe, rp); + + /* store argument values into ri->data as entry data */ + if (tk->tp.entry_arg) + store_trace_entry_data(ri->data, &tk->tp, regs); + + return 0; +} + + static nokprobe_inline void __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, @@ -1399,7 +1524,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, ri->data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tk->tp.size + dsize); @@ -1409,7 +1534,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, fbuffer.regs = regs; entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } @@ -1557,7 +1682,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, NULL); __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1568,7 +1693,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -1593,7 +1718,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (hlist_empty(head)) return; - dsize = __get_data_size(&tk->tp, regs); + dsize = __get_data_size(&tk->tp, regs, ri->data); __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1604,7 +1729,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = get_kretprobe_retaddr(ri); - store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tk->tp, regs, ri->data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -1770,26 +1895,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { enum probe_print_type ptype; - struct trace_kprobe *tk; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int ret; char *event; if (func) { - unsigned int count; - - count = number_of_same_symbols(func); - if (count > 1) - /* - * Users should use addr to remove the ambiguity of - * using func only. - */ - return ERR_PTR(-EADDRNOTAVAIL); - else if (count == 0) - /* - * We can return ENOENT earlier than when register the - * kprobe. - */ - return ERR_PTR(-ENOENT); + ret = validate_probe_symbol(func); + if (ret) + return ERR_PTR(ret); } /* @@ -1813,19 +1926,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, ptype = trace_kprobe_is_return(tk) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; - if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { - ret = -ENOMEM; - goto error; - } + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) + return ERR_PTR(-ENOMEM); ret = __register_trace_kprobe(tk); if (ret < 0) - goto error; + return ERR_PTR(ret); - return trace_probe_event_call(&tk->tp); -error: - free_trace_kprobe(tk); - return ERR_PTR(ret); + return trace_probe_event_call(&(no_free_ptr(tk)->tp)); } void destroy_local_trace_kprobe(struct trace_event_call *event_call) @@ -1854,13 +1962,12 @@ static __init void enable_boot_kprobe_events(void) struct trace_kprobe *tk; struct dyn_event *pos; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { list_for_each_entry(file, &tr->events, list) if (file->event_call == trace_probe_event_call(&tk->tp)) trace_event_enable_disable(file, 1, 0); } - mutex_unlock(&event_mutex); } static __init void setup_boot_kprobe_events(void) @@ -1897,7 +2004,7 @@ static __init int init_kprobe_trace_early(void) if (ret) return ret; - if (register_module_notifier(&trace_kprobe_module_nb)) + if (trace_kprobe_register_module_notifier()) return -EINVAL; return 0; @@ -1963,19 +2070,16 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on probing function entry.\n"); + if (WARN_ONCE(ret, "error on probing function entry.")) { warn++; } else { /* Enable trace point */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting new probe.\n"); + if (WARN_ONCE(tk == NULL, "error on probing function entry.")) { warn++; } else { file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else enable_trace_kprobe( @@ -1984,19 +2088,16 @@ static __init int kprobe_trace_self_tests_init(void) } ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on probing function return.\n"); + if (WARN_ONCE(ret, "error on probing function return.")) { warn++; } else { /* Enable trace point */ tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting 2nd new probe.\n"); + if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) { warn++; } else { file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else enable_trace_kprobe( @@ -2019,18 +2120,15 @@ static __init int kprobe_trace_self_tests_init(void) /* Disable trace points before removing it */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting test probe.\n"); + if (WARN_ONCE(tk == NULL, "error on getting test probe.")) { warn++; } else { - if (trace_kprobe_nhit(tk) != 1) { - pr_warn("incorrect number of testprobe hits\n"); + if (WARN_ONCE(trace_kprobe_nhit(tk) != 1, + "incorrect number of testprobe hits.")) warn++; - } file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else disable_trace_kprobe( @@ -2038,18 +2136,15 @@ static __init int kprobe_trace_self_tests_init(void) } tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tk == NULL)) { - pr_warn("error on getting 2nd test probe.\n"); + if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) { warn++; } else { - if (trace_kprobe_nhit(tk) != 1) { - pr_warn("incorrect number of testprobe2 hits\n"); + if (WARN_ONCE(trace_kprobe_nhit(tk) != 1, + "incorrect number of testprobe2 hits.")) warn++; - } file = find_trace_probe_file(tk, top_trace_array()); - if (WARN_ON_ONCE(file == NULL)) { - pr_warn("error on getting probe file.\n"); + if (WARN_ONCE(file == NULL, "error on getting probe file.")) { warn++; } else disable_trace_kprobe( @@ -2057,23 +2152,15 @@ static __init int kprobe_trace_self_tests_init(void) } ret = create_or_delete_trace_kprobe("-:testprobe"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on deleting a probe.\n"); + if (WARN_ONCE(ret, "error on deleting a probe.")) warn++; - } ret = create_or_delete_trace_kprobe("-:testprobe2"); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on deleting a probe.\n"); + if (WARN_ONCE(ret, "error on deleting a probe.")) warn++; - } + end: - ret = dyn_events_release_all(&trace_kprobe_ops); - if (WARN_ON_ONCE(ret)) { - pr_warn("error on cleaning up probes.\n"); - warn++; - } /* * Wait for the optimizer work to finish. Otherwise it might fiddle * with probes in already freed __init text. diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 64e77b513697..ba5858866b2f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -294,7 +294,6 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { - struct trace_event_call *call = &event_mmiotrace_rw; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -310,8 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -325,7 +323,6 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { - struct trace_event_call *call = &event_mmiotrace_map; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -341,8 +338,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a8e28f9b9271..f3a2722ee4c0 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void) return this_cpu_ptr(&per_cpu_osnoise_var); } +/* + * Protect the interface. + */ +static struct mutex interface_lock; + #ifdef CONFIG_TIMERLAT_TRACER /* * Runtime information for the timer mode. @@ -259,14 +264,20 @@ static inline void tlat_var_reset(void) { struct timerlat_variables *tlat_var; int cpu; + + /* Synchronize with the timerlat interfaces */ + mutex_lock(&interface_lock); /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ for_each_cpu(cpu, cpu_online_mask) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ #define tlat_var_reset() do {} while (0) @@ -332,11 +343,6 @@ struct timerlat_sample { #endif /* - * Protect the interface. - */ -static struct mutex interface_lock; - -/* * Tracer data. */ static struct osnoise_data { @@ -493,7 +499,6 @@ static void print_osnoise_headers(struct seq_file *s) static void __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -511,8 +516,7 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe entry->softirq_count = sample->softirq_count; entry->thread_count = sample->thread_count; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -572,7 +576,6 @@ static void print_timerlat_headers(struct seq_file *s) static void __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -585,8 +588,7 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf entry->context = sample->context; entry->timer_latency = sample->timer_latency; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -648,7 +650,6 @@ static void timerlat_save_stack(int skip) static void __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) { - struct trace_event_call *call = &event_osnoise; struct ring_buffer_event *event; struct stack_entry *entry; @@ -662,8 +663,7 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit_nostack(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* @@ -1229,6 +1229,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int } } +static bool monitor_enabled; + static int register_migration_monitor(void) { int ret = 0; @@ -1237,16 +1239,25 @@ static int register_migration_monitor(void) * Timerlat thread migration check is only required when running timerlat in user-space. * Thus, enable callback only if timerlat is set with no workload. */ - if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (WARN_ON_ONCE(monitor_enabled)) + return 0; + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!ret) + monitor_enabled = true; + } return ret; } static void unregister_migration_monitor(void) { - if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) - unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!monitor_enabled) + return; + + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + monitor_enabled = false; } #else static int register_migration_monitor(void) @@ -1444,9 +1455,9 @@ static int run_osnoise(void) save_osn_sample_stats(osn_var, &s); /* - * if threshold is 0, use the default value of 5 us. + * if threshold is 0, use the default value of 1 us. */ - threshold = tracing_thresh ? : 5000; + threshold = tracing_thresh ? : 1000; /* * Apply PREEMPT and IRQ disabled options. @@ -1535,7 +1546,7 @@ static int run_osnoise(void) * This will eventually cause unwarranted noise as PREEMPT_RCU * will force preemption as the means of ending the current * grace period. We avoid this problem by calling - * rcu_momentary_dyntick_idle(), which performs a zero duration + * rcu_momentary_eqs(), which performs a zero duration * EQS allowing PREEMPT_RCU to end the current grace period. * This call shouldn't be wrapped inside an RCU critical * section. @@ -1547,7 +1558,7 @@ static int run_osnoise(void) if (!disable_irq) local_irq_disable(); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); if (!disable_irq) local_irq_enable(); @@ -1612,6 +1623,7 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +static struct cpumask kthread_cpumask; /* * osnoise_sleep - sleep until the next period @@ -1675,6 +1687,7 @@ static inline int osnoise_migration_pending(void) */ mutex_lock(&interface_lock); this_cpu_osn_var()->kthread = NULL; + cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask); mutex_unlock(&interface_lock); return 1; @@ -1945,11 +1958,12 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; - kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); if (kthread) { - if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && + !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); - } else { + } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) { /* * This is a user thread waiting on the timerlat_fd. We need * to close all users, and the best way to guarantee this is @@ -1958,7 +1972,6 @@ static void stop_kthread(unsigned int cpu) kill_pid(kthread->thread_pid, SIGKILL, 1); put_task_struct(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { @@ -1967,7 +1980,6 @@ static void stop_kthread(unsigned int cpu) */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); - return; } } } @@ -1999,6 +2011,10 @@ static int start_kthread(unsigned int cpu) void *main = osnoise_main; char comm[24]; + /* Do not start a new thread if it is already running */ + if (per_cpu(per_cpu_osnoise_var, cpu).kthread) + return 0; + if (timerlat_enabled()) { snprintf(comm, 24, "timerlat/%d", cpu); main = timerlat_main; @@ -2021,6 +2037,7 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + cpumask_set_cpu(cpu, &kthread_cpumask); return 0; } @@ -2048,8 +2065,15 @@ static int start_per_cpu_kthreads(void) */ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask); - for_each_possible_cpu(cpu) - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + for_each_possible_cpu(cpu) { + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { + struct task_struct *kthread; + + kthread = xchg_relaxed(&(per_cpu(per_cpu_osnoise_var, cpu).kthread), NULL); + if (!WARN_ON(!kthread)) + kthread_stop(kthread); + } + } for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); @@ -2070,24 +2094,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) { unsigned int cpu = smp_processor_id(); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!osnoise_has_registered_instances()) - goto out_unlock_trace; + return; - mutex_lock(&interface_lock); - cpus_read_lock(); + guard(mutex)(&interface_lock); + guard(cpus_read_lock)(); + + if (!cpu_online(cpu)) + return; if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) - goto out_unlock; + return; start_kthread(cpu); - -out_unlock: - cpus_read_unlock(); - mutex_unlock(&interface_lock); -out_unlock_trace: - mutex_unlock(&trace_types_lock); } static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); @@ -2285,31 +2306,22 @@ static ssize_t osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&interface_lock); + guard(mutex)(&interface_lock); len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); - if (!mask_str) { - count = -ENOMEM; - goto out_unlock; - } + if (!mask_str) + return -ENOMEM; len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_free; - } + if (len >= count) + return -EINVAL; count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); -out_free: - kfree(mask_str); -out_unlock: - mutex_unlock(&interface_lock); - return count; } @@ -2579,7 +2591,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file) osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); - hrtimer_cancel(&tlat_var->timer); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); osn_var->sampling = 0; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d8b302d01083..03d56f711ad1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) { + struct trace_seq *s = &iter->seq; va_list ap; + if (ignore_event(iter)) + return; + va_start(ap, fmt); - trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); @@ -460,20 +464,31 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : bh_off ? 'b' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; - switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'B'; + break; case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'N'; break; + case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'L'; + break; + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: + need_resched = 'b'; + break; case TRACE_FLAG_NEED_RESCHED: need_resched = 'n'; break; case TRACE_FLAG_PREEMPT_RESCHED: need_resched = 'p'; break; + case TRACE_FLAG_NEED_RESCHED_LAZY: + need_resched = 'l'; + break; default: need_resched = '.'; break; @@ -990,8 +1005,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, int flags) + unsigned long parent_ip, long delta, int flags) { + ip += delta; + parent_ip += delta; + seq_print_ip_sym(s, ip, flags); if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { @@ -1009,7 +1027,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1230,6 +1248,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; + long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1242,7 +1261,11 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, break; trace_seq_puts(s, " => "); - seq_print_ip_sym(s, *p, flags); + if ((*p) == FTRACE_TRAMPOLINE_MARKER) { + trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); + continue; + } + seq_print_ip_sym(s, (*p) + delta, flags); trace_seq_putc(s, '\n'); } @@ -1587,10 +1610,13 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; + unsigned long ip; trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); + ip = field->ip + iter->tr->text_delta; + + seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); return trace_handle_return(s); @@ -1674,7 +1700,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index e37446f7916e..0c42b15c3800 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/ftrace.h> #include <linux/kprobes.h> +#include <linux/hardirq.h> #include "trace.h" #define CREATE_TRACE_POINTS @@ -20,13 +21,29 @@ * tooling: these calls will only happen with RCU enabled, which can * use a regular tracepoint. * - * On older architectures, use the rcuidle tracing methods (which - * aren't NMI-safe - so exclude NMI contexts): + * On older architectures, RCU may not be watching in idle. In that + * case, wake up RCU to watch while calling the tracepoint. These + * aren't NMI-safe - so exclude NMI contexts: */ #ifdef CONFIG_ARCH_WANTS_NO_INSTR -#define trace(point) trace_##point +#define trace(point, args) trace_##point(args) #else -#define trace(point) if (!in_nmi()) trace_##point##_rcuidle +#define trace(point, args) \ + do { \ + if (trace_##point##_enabled()) { \ + bool exit_rcu = false; \ + if (in_nmi()) \ + break; \ + if (!IS_ENABLED(CONFIG_TINY_RCU) && \ + is_idle_task(current)) { \ + ct_irq_enter(); \ + exit_rcu = true; \ + } \ + trace_##point(args); \ + if (exit_rcu) \ + ct_irq_exit(); \ + } \ + } while (0) #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -42,7 +59,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -53,7 +70,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -75,7 +92,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); } } @@ -89,7 +106,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -100,13 +117,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace(preempt_enable)(a0, a1); + trace(preempt_enable, TP_ARGS(a0, a1)); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace(preempt_disable)(a0, a1); + trace(preempt_disable, TP_ARGS(a0, a1)); tracer_preempt_off(a0, a1); } #endif diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 34289f9c6707..8f58ee1e8858 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include <linux/bpf.h> +#include <linux/fs.h> #include "trace_btf.h" #include "trace_probe.h" @@ -275,7 +276,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; - } else if (len > MAX_EVENT_NAME_LEN) { + } else if (len >= MAX_EVENT_NAME_LEN) { trace_probe_log_err(offset, EVENT_TOO_LONG); return -EINVAL; } @@ -553,6 +554,10 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, anon_offs = 0; field = btf_find_struct_member(ctx->btf, type, fieldname, &anon_offs); + if (IS_ERR(field)) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return PTR_ERR(field); + } if (!field) { trace_probe_log_err(ctx->offset, NO_BTF_FIELD); return -ENOENT; @@ -594,6 +599,8 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, return 0; } +static int __store_entry_arg(struct trace_probe *tp, int argnum); + static int parse_btf_arg(char *varname, struct fetch_insn **pcode, struct fetch_insn *end, struct traceprobe_parse_context *ctx) @@ -618,11 +625,7 @@ static int parse_btf_arg(char *varname, return -EOPNOTSUPP; } - if (ctx->flags & TPARG_FL_RETURN) { - if (strcmp(varname, "$retval") != 0) { - trace_probe_log_err(ctx->offset, NO_BTFARG); - return -ENOENT; - } + if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) { code->op = FETCH_OP_RETVAL; /* Check whether the function return type is not void */ if (query_btf_context(ctx) == 0) { @@ -654,11 +657,21 @@ static int parse_btf_arg(char *varname, const char *name = btf_name_by_offset(ctx->btf, params[i].name_off); if (name && !strcmp(name, varname)) { - code->op = FETCH_OP_ARG; - if (ctx->flags & TPARG_FL_TPOINT) - code->param = i + 1; - else - code->param = i; + if (tparg_is_function_entry(ctx->flags)) { + code->op = FETCH_OP_ARG; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; + } else if (tparg_is_function_return(ctx->flags)) { + code->op = FETCH_OP_EDATA; + ret = __store_entry_arg(ctx->tp, i); + if (ret < 0) { + /* internal error */ + return ret; + } + code->offset = ret; + } tid = params[i].type; goto found; } @@ -755,6 +768,110 @@ static int check_prepare_btf_string_fetch(char *typename, #endif +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + +static int __store_entry_arg(struct trace_probe *tp, int argnum) +{ + struct probe_entry_arg *earg = tp->entry_arg; + bool match = false; + int i, offset; + + if (!earg) { + earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); + if (!earg) + return -ENOMEM; + earg->size = 2 * tp->nr_args + 1; + earg->code = kcalloc(earg->size, sizeof(struct fetch_insn), + GFP_KERNEL); + if (!earg->code) { + kfree(earg); + return -ENOMEM; + } + /* Fill the code buffer with 'end' to simplify it */ + for (i = 0; i < earg->size; i++) + earg->code[i].op = FETCH_OP_END; + tp->entry_arg = earg; + } + + offset = 0; + for (i = 0; i < earg->size - 1; i++) { + switch (earg->code[i].op) { + case FETCH_OP_END: + earg->code[i].op = FETCH_OP_ARG; + earg->code[i].param = argnum; + earg->code[i + 1].op = FETCH_OP_ST_EDATA; + earg->code[i + 1].offset = offset; + return offset; + case FETCH_OP_ARG: + match = (earg->code[i].param == argnum); + break; + case FETCH_OP_ST_EDATA: + offset = earg->code[i].offset; + if (match) + return offset; + offset += sizeof(unsigned long); + break; + default: + break; + } + } + return -ENOSPC; +} + +int traceprobe_get_entry_data_size(struct trace_probe *tp) +{ + struct probe_entry_arg *earg = tp->entry_arg; + int i, size = 0; + + if (!earg) + return 0; + + for (i = 0; i < earg->size; i++) { + switch (earg->code[i].op) { + case FETCH_OP_END: + goto out; + case FETCH_OP_ST_EDATA: + size = earg->code[i].offset + sizeof(unsigned long); + break; + default: + break; + } + } +out: + return size; +} + +void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_entry_arg *earg = tp->entry_arg; + unsigned long val = 0; + int i; + + if (!earg) + return; + + for (i = 0; i < earg->size; i++) { + struct fetch_insn *code = &earg->code[i]; + + switch (code->op) { + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; + case FETCH_OP_ST_EDATA: + *(unsigned long *)((unsigned long)edata + code->offset) = val; + break; + case FETCH_OP_END: + goto end; + default: + break; + } + } +end: + return; +} +NOKPROBE_SYMBOL(store_trace_entry_data) +#endif + #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) /* Parse $vars. @orig_arg points '$', which syncs to @ctx->offset */ @@ -830,7 +947,7 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API len = str_has_prefix(arg, "arg"); - if (len && tparg_is_function_entry(ctx->flags)) { + if (len) { ret = kstrtoul(arg + len, 10, ¶m); if (ret) goto inval; @@ -839,15 +956,29 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, err = TP_ERR_BAD_ARG_NUM; goto inval; } + param--; /* argN starts from 1, but internal arg[N] starts from 0 */ - code->op = FETCH_OP_ARG; - code->param = (unsigned int)param - 1; - /* - * The tracepoint probe will probe a stub function, and the - * first parameter of the stub is a dummy and should be ignored. - */ - if (ctx->flags & TPARG_FL_TPOINT) - code->param++; + if (tparg_is_function_entry(ctx->flags)) { + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param; + /* + * The tracepoint probe will probe a stub function, and the + * first parameter of the stub is a dummy and should be ignored. + */ + if (ctx->flags & TPARG_FL_TPOINT) + code->param++; + } else if (tparg_is_function_return(ctx->flags)) { + /* function entry argument access from return probe */ + ret = __store_entry_arg(ctx->tp, param); + if (ret < 0) /* This error should be an internal error */ + return ret; + + code->op = FETCH_OP_EDATA; + code->offset = ret; + } else { + err = TP_ERR_NOFENTRY_ARGS; + goto inval; + } return 0; } #endif @@ -1037,7 +1168,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, break; default: if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ - if (!tparg_is_function_entry(ctx->flags)) { + if (!tparg_is_function_entry(ctx->flags) && + !tparg_is_function_return(ctx->flags)) { trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EINVAL; } @@ -1053,8 +1185,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type, return ret; } -#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) - /* Bitfield type needs to be parsed into a fetch function */ static int __parse_bitfield_probe_arg(const char *bf, const struct fetch_type *t, @@ -1090,67 +1220,45 @@ static int __parse_bitfield_probe_arg(const char *bf, return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; } -/* String length checking wrapper */ -static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, - struct probe_arg *parg, - struct traceprobe_parse_context *ctx) +/* Split type part from @arg and return it. */ +static char *parse_probe_arg_type(char *arg, struct probe_arg *parg, + struct traceprobe_parse_context *ctx) { - struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2, *t3; - int ret, len; - char *arg; + char *t = NULL, *t2, *t3; + int offs; - arg = kstrdup(argv, GFP_KERNEL); - if (!arg) - return -ENOMEM; - - ret = -EINVAL; - len = strlen(arg); - if (len > MAX_ARGSTR_LEN) { - trace_probe_log_err(ctx->offset, ARG_TOO_LONG); - goto out; - } else if (len == 0) { - trace_probe_log_err(ctx->offset, NO_ARG_BODY); - goto out; - } - - ret = -ENOMEM; - parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) - goto out; - - ret = -EINVAL; t = strchr(arg, ':'); if (t) { - *t = '\0'; - t2 = strchr(++t, '['); + *t++ = '\0'; + t2 = strchr(t, '['); if (t2) { *t2++ = '\0'; t3 = strchr(t2, ']'); if (!t3) { - int offs = t2 + strlen(t2) - arg; + offs = t2 + strlen(t2) - arg; trace_probe_log_err(ctx->offset + offs, ARRAY_NO_CLOSE); - goto out; + return ERR_PTR(-EINVAL); } else if (t3[1] != '\0') { trace_probe_log_err(ctx->offset + t3 + 1 - arg, BAD_ARRAY_SUFFIX); - goto out; + return ERR_PTR(-EINVAL); } *t3 = '\0'; if (kstrtouint(t2, 0, &parg->count) || !parg->count) { trace_probe_log_err(ctx->offset + t2 - arg, BAD_ARRAY_NUM); - goto out; + return ERR_PTR(-EINVAL); } if (parg->count > MAX_ARRAY_LEN) { trace_probe_log_err(ctx->offset + t2 - arg, ARRAY_TOO_BIG); - goto out; + return ERR_PTR(-EINVAL); } } } + offs = t ? t - arg : 0; /* * Since $comm and immediate string can not be dereferenced, @@ -1161,74 +1269,52 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, strncmp(arg, "\\\"", 2) == 0)) { /* The type of $comm must be "string", and not an array type. */ if (parg->count || (t && strcmp(t, "string"))) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), - NEED_STRING_TYPE); - goto out; + trace_probe_log_err(ctx->offset + offs, NEED_STRING_TYPE); + return ERR_PTR(-EINVAL); } parg->type = find_fetch_type("string", ctx->flags); } else parg->type = find_fetch_type(t, ctx->flags); + if (!parg->type) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); - goto out; + trace_probe_log_err(ctx->offset + offs, BAD_TYPE); + return ERR_PTR(-EINVAL); } - code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); - if (!code) - goto out; - code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; - - ctx->last_type = NULL; - ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - ctx); - if (ret) - goto fail; - - /* Update storing type if BTF is available */ - if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && - ctx->last_type) { - if (!t) { - parg->type = find_fetch_type_from_btf_type(ctx); - } else if (strstr(t, "string")) { - ret = check_prepare_btf_string_fetch(t, &code, ctx); - if (ret) - goto fail; - } - } - parg->offset = *size; - *size += parg->type->size * (parg->count ?: 1); + return t; +} - if (parg->count) { - len = strlen(parg->type->fmttype) + 6; - parg->fmt = kmalloc(len, GFP_KERNEL); - if (!parg->fmt) { - ret = -ENOMEM; - goto out; - } - snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, - parg->count); - } +/* After parsing, adjust the fetch_insn according to the probe_arg */ +static int finalize_fetch_insn(struct fetch_insn *code, + struct probe_arg *parg, + char *type, + int type_offset, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *scode; + int ret; - ret = -EINVAL; /* Store operation */ if (parg->type->is_string) { + /* Check bad combination of the type and the last fetch_insn. */ if (!strcmp(parg->type->name, "symstr")) { if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + type_offset, BAD_SYMSTRING); - goto fail; + return -EINVAL; } } else { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + trace_probe_log_err(ctx->offset + type_offset, BAD_STRING); - goto fail; + return -EINVAL; } } + if (!strcmp(parg->type->name, "symstr") || (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || @@ -1244,9 +1330,10 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(ctx->offset, TOO_MANY_OPS); - goto fail; + return -EINVAL; } } + /* If op == DEREF, replace it with STRING */ if (!strcmp(parg->type->name, "ustring") || code->op == FETCH_OP_UDEREF) @@ -1267,47 +1354,128 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(ctx->offset, TOO_MANY_OPS); - goto fail; + return -E2BIG; } code->op = FETCH_OP_ST_RAW; code->size = parg->type->size; } + + /* Save storing fetch_insn. */ scode = code; + /* Modify operation */ - if (t != NULL) { - ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (type != NULL) { + /* Bitfield needs a special fetch_insn. */ + ret = __parse_bitfield_probe_arg(type, parg->type, &code); if (ret) { - trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); - goto fail; + trace_probe_log_err(ctx->offset + type_offset, BAD_BITFIELD); + return ret; } } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && ctx->last_type) { + /* If user not specified the type, try parsing BTF bitfield. */ ret = parse_btf_bitfield(&code, ctx); if (ret) - goto fail; + return ret; } - ret = -EINVAL; + /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING && scode->op != FETCH_OP_ST_USTRING) { - trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), - BAD_STRING); - goto fail; + trace_probe_log_err(ctx->offset + type_offset, BAD_STRING); + return -EINVAL; } code++; if (code->op != FETCH_OP_NOP) { trace_probe_log_err(ctx->offset, TOO_MANY_OPS); - goto fail; + return -E2BIG; } code->op = FETCH_OP_LP_ARRAY; code->param = parg->count; } + + /* Finalize the fetch_insn array. */ code++; code->op = FETCH_OP_END; - ret = 0; + return 0; +} + +/* String length checking wrapper */ +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code, *tmp = NULL; + char *type, *arg __free(kfree) = NULL; + int ret, len; + + len = strlen(argv); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); + return -E2BIG; + } else if (len == 0) { + trace_probe_log_err(ctx->offset, NO_ARG_BODY); + return -EINVAL; + } + + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) + return -ENOMEM; + + type = parse_probe_arg_type(arg, parg, ctx); + if (IS_ERR(type)) + return PTR_ERR(type); + + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); + if (!code) + return -ENOMEM; + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ctx->last_type = NULL; + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + ctx); + if (ret < 0) + goto fail; + + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + if (!type) { + parg->type = find_fetch_type_from_btf_type(ctx); + } else if (strstr(type, "string")) { + ret = check_prepare_btf_string_fetch(type, &code, ctx); + if (ret) + goto fail; + } + } + parg->offset = *size; + *size += parg->type->size * (parg->count ?: 1); + + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) { + ret = -ENOMEM; + goto fail; + } + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } + + ret = finalize_fetch_insn(code, parg, type, type ? type - arg : 0, ctx); + if (ret < 0) + goto fail; + + for (; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_OP_END) + break; /* Shrink down the code buffer */ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) @@ -1316,15 +1484,13 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); fail: - if (ret) { + if (ret < 0) { for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) if (code->op == FETCH_NOP_SYMBOL || code->op == FETCH_OP_DATA) kfree(code->data); } kfree(tmp); -out: - kfree(arg); return ret; } @@ -1379,9 +1545,7 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, struct probe_arg *parg = &tp->args[i]; const char *body; - /* Increment count for freeing args in error case */ - tp->nr_args++; - + ctx->tp = tp; body = strchr(arg, '='); if (body) { if (body - arg > MAX_ARG_NAME_LEN) { @@ -1438,7 +1602,8 @@ static int argv_has_var_arg(int argc, const char *argv[], int *args_idx, if (str_has_prefix(argv[i], "$arg")) { trace_probe_log_set_index(i + 2); - if (!tparg_is_function_entry(ctx->flags)) { + if (!tparg_is_function_entry(ctx->flags) && + !tparg_is_function_return(ctx->flags)) { trace_probe_log_err(0, NOFENTRY_ARGS); return -EINVAL; } @@ -1495,7 +1660,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], { const struct btf_param *params = NULL; int i, j, n, used, ret, args_idx = -1; - const char **new_argv = NULL; + const char **new_argv __free(kfree) = NULL; ret = argv_has_var_arg(argc, argv, &args_idx, ctx); if (ret < 0) @@ -1534,7 +1699,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], ret = sprint_nth_btf_arg(n, "", buf + used, bufsize - used, ctx); if (ret < 0) - goto error; + return ERR_PTR(ret); new_argv[j++] = buf + used; used += ret + 1; @@ -1548,25 +1713,78 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], n = simple_strtoul(argv[i] + 4, &type, 10); if (type && !(*type == ':' || *type == '\0')) { trace_probe_log_err(0, BAD_VAR); - ret = -ENOENT; - goto error; + return ERR_PTR(-ENOENT); } /* Note: $argN starts from $arg1 */ ret = sprint_nth_btf_arg(n - 1, type, buf + used, bufsize - used, ctx); if (ret < 0) - goto error; + return ERR_PTR(ret); new_argv[j++] = buf + used; used += ret + 1; } else new_argv[j++] = argv[i]; } - return new_argv; + return_ptr(new_argv); +} -error: - kfree(new_argv); - return ERR_PTR(ret); +/* @buf: *buf must be equal to NULL. Caller must to free *buf */ +int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) +{ + int i, used, ret; + const int bufsize = MAX_DENTRY_ARGS_LEN; + char *tmpbuf __free(kfree) = NULL; + + if (*buf) + return -EINVAL; + + used = 0; + for (i = 0; i < argc; i++) { + char *tmp __free(kfree) = NULL; + char *equal; + size_t arg_len; + + if (!glob_match("*:%p[dD]", argv[i])) + continue; + + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + arg_len = strlen(argv[i]); + tmp[arg_len - 4] = '\0'; + if (argv[i][arg_len - 1] == 'd') + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + else + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + + if (ret >= bufsize - used) + return -ENOMEM; + argv[i] = tmpbuf + used; + used += ret + 1; + } + + *buf = no_free_ptr(tmpbuf); + return 0; } void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) @@ -1761,12 +1979,18 @@ void trace_probe_cleanup(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) traceprobe_free_probe_arg(&tp->args[i]); + if (tp->entry_arg) { + kfree(tp->entry_arg->code); + kfree(tp->entry_arg); + tp->entry_arg = NULL; + } + if (tp->event) trace_probe_unlink(tp); } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group, bool alloc_filter) + const char *group, bool alloc_filter, int nargs) { struct trace_event_call *call; size_t size = sizeof(struct trace_probe_event); @@ -1802,6 +2026,11 @@ int trace_probe_init(struct trace_probe *tp, const char *event, goto error; } + tp->nr_args = nargs; + /* Make sure pointers in args[] are NULL */ + if (nargs) + memset(tp->args, 0, sizeof(tp->args[0]) * nargs); + return 0; error: diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index c1877d018269..96792bc4b092 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,8 +34,8 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX -#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -92,6 +92,7 @@ enum fetch_op { FETCH_OP_ARG, /* Function argument : .param */ FETCH_OP_FOFFS, /* File offset: .immediate */ FETCH_OP_DATA, /* Allocated data: .data */ + FETCH_OP_EDATA, /* Entry data: .offset */ // Stage 2 (dereference) op FETCH_OP_DEREF, /* Dereference: .offset */ FETCH_OP_UDEREF, /* User-space Dereference: .offset */ @@ -102,6 +103,7 @@ enum fetch_op { FETCH_OP_ST_STRING, /* String: .offset, .size */ FETCH_OP_ST_USTRING, /* User String: .offset, .size */ FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */ + FETCH_OP_ST_EDATA, /* Store Entry Data: .offset */ // Stage 4 (modify) op FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ // Stage 5 (loop) op @@ -232,6 +234,11 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct probe_entry_arg { + struct fetch_insn *code; + unsigned int size; /* The entry data size */ +}; + struct trace_uprobe_filter { rwlock_t rwlock; int nr_systemwide; @@ -253,6 +260,7 @@ struct trace_probe { struct trace_probe_event *event; ssize_t size; /* trace entry size */ unsigned int nr_args; + struct probe_entry_arg *entry_arg; /* This is only for return probe */ struct probe_arg args[]; }; @@ -338,7 +346,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group, bool alloc_filter); + const char *group, bool alloc_filter, int nargs); void trace_probe_cleanup(struct trace_probe *tp); int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); void trace_probe_unlink(struct trace_probe *tp); @@ -355,6 +363,18 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, u8 *data, void *field); +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +int traceprobe_get_entry_data_size(struct trace_probe *tp); +/* This is a runtime function to store entry data */ +void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs); +#else /* !CONFIG_HAVE_FUNCTION_ARG_ACCESS_API */ +static inline int traceprobe_get_entry_data_size(struct trace_probe *tp) +{ + return 0; +} +#define store_trace_entry_data(edata, tp, regs) do { } while (0) +#endif + #define trace_probe_for_each_link(pos, tp) \ list_for_each_entry(pos, &(tp)->event->files, list) #define trace_probe_for_each_link_rcu(pos, tp) \ @@ -381,6 +401,11 @@ static inline bool tparg_is_function_entry(unsigned int flags) return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); } +static inline bool tparg_is_function_return(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN); +} + struct traceprobe_parse_context { struct trace_event_call *event; /* BTF related parameters */ @@ -392,6 +417,7 @@ struct traceprobe_parse_context { const struct btf_type *last_type; /* Saved type */ u32 last_bitoffs; /* Saved bitoffs */ u32 last_bitsize; /* Saved bitsize */ + struct trace_probe *tp; unsigned int flags; int offset; }; @@ -402,6 +428,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); @@ -453,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_TP_NAME, "Invalid character in tracepoint name"),\ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -506,7 +534,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTFARG, "This variable is not found at this probe point"),\ C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ - C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry or exit"), \ C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \ C(ARGIDX_2BIG, "$argN index is too big"), \ @@ -516,7 +544,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ - C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 3935b347f874..f39b37fcdb3b 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -54,7 +54,7 @@ fetch_apply_bitfield(struct fetch_insn *code, void *buf) * If dest is NULL, don't store result and return required dynamic data size. */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, void *dest, void *base); static nokprobe_inline int fetch_store_strlen(unsigned long addr); static nokprobe_inline int @@ -232,7 +232,7 @@ array: /* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +__get_data_size(struct trace_probe *tp, void *regs, void *edata) { struct probe_arg *arg; int i, len, ret = 0; @@ -240,7 +240,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) for (i = 0; i < tp->nr_args; i++) { arg = tp->args + i; if (unlikely(arg->dynamic)) { - len = process_fetch_insn(arg->code, regs, NULL, NULL); + len = process_fetch_insn(arg->code, regs, edata, NULL, NULL); if (len > 0) ret += len; } @@ -251,7 +251,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) /* Store the value of each argument */ static nokprobe_inline void -store_trace_args(void *data, struct trace_probe *tp, void *rec, +store_trace_args(void *data, struct trace_probe *tp, void *rec, void *edata, int header_size, int maxlen) { struct probe_arg *arg; @@ -266,7 +266,7 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, /* Point the dynamic data area if needed */ if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); - ret = process_fetch_insn(arg->code, rec, dl, base); + ret = process_fetch_insn(arg->code, rec, edata, dl, base); if (arg->dynamic && likely(ret > 0)) { dyndata += ret; maxlen -= ret; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c9ffdcfe622e..cb49f7279dc8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> +#include <linux/kmemleak.h> #include <linux/ftrace.h> #include <trace/events/sched.h> @@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } + +/* + * The tgid_map array maps from pid to tgid; i.e. the value stored at index i + * is the tgid last observed corresponding to pid=i. + */ +static int *tgid_map; + +/* The maximum valid index into tgid_map. */ +static size_t tgid_map_max; + +#define SAVED_CMDLINES_DEFAULT 128 +#define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char saved_cmdlines[]; +}; +static struct saved_cmdlines_buffer *savedcmd; + +/* Holds the size of a cmdline and pid element */ +#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ + (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) + +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + strscpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kmemleak_free(s); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + kmemleak_alloc(s, size, 1, GFP_KERNEL); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + s->cmdline_num = val; + + /* Place map_cmdline_to_pid array right after saved_cmdlines */ + s->map_cmdline_to_pid = (unsigned *)&s->saved_cmdlines[val * TASK_COMM_LEN]; + + s->cmdline_idx = 0; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return s; +} + +int trace_create_savedcmd(void) +{ + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); + + return savedcmd ? 0 : -ENOMEM; +} + +int trace_save_cmdline(struct task_struct *tsk) +{ + unsigned tpid, idx; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. + */ + lockdep_assert_preemption_disabled(); + if (!arch_spin_trylock(&trace_cmdline_lock)) + return 0; + + idx = savedcmd->map_pid_to_cmdline[tpid]; + if (idx == NO_CMDLINE_MAP) { + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; + + savedcmd->map_pid_to_cmdline[tpid] = idx; + savedcmd->cmdline_idx = idx; + } + + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + set_cmdline(idx, tsk->comm); + + arch_spin_unlock(&trace_cmdline_lock); + + return 1; +} + +static void __trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + int tpid; + + if (!pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } + } + strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int *trace_find_tgid_ptr(int pid) +{ + /* + * Pairs with the smp_store_release in set_tracer_flag() to ensure that + * if we observe a non-NULL tgid_map then we also observe the correct + * tgid_map_max. + */ + int *map = smp_load_acquire(&tgid_map); + + if (unlikely(!map || pid > tgid_map_max)) + return NULL; + + return &map[pid]; +} + +int trace_find_tgid(int pid) +{ + int *ptr = trace_find_tgid_ptr(pid); + + return ptr ? *ptr : 0; +} + +static int trace_save_tgid(struct task_struct *tsk) +{ + int *ptr; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + ptr = trace_find_tgid_ptr(tsk->pid); + if (!ptr) + return 0; + + *ptr = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); +} + +int trace_alloc_tgid_map(void) +{ + int *map; + + if (tgid_map) + return 0; + + tgid_map_max = init_pid_ns.pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + if (!map) + return -ENOMEM; + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. + */ + smp_store_release(&tgid_map, map); + return 0; +} + +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int pid = ++(*pos); + + return trace_find_tgid_ptr(pid); +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + int pid = *pos; + + return trace_find_tgid_ptr(pid); +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int *entry = (int *)v; + int pid = entry - tgid_map; + int tgid = *entry; + + if (tgid == 0) + return SEQ_SKIP; + + seq_printf(m, "%d %d\n", pid, tgid); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +void trace_free_saved_cmdlines_buffer(void) +{ + free_saved_cmdlines_buffer(savedcmd); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = allocate_cmdlines_buffer(val); + if (!s) + return -ENOMEM; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0469a04a355f..af30586f1aea 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -112,14 +112,17 @@ static int wakeup_display_graph(struct trace_array *tr, int set) return start_func_tracer(tr, set); } -static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +static int wakeup_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; int ret = 0; - if (ftrace_graph_ignore_func(trace)) + if (ftrace_graph_ignore_func(gops, trace)) return 0; /* * Do not trace a function if it's filtered by set_graph_notrace. @@ -134,6 +137,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return 0; + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (!calltime) + return 0; + + *calltime = trace_clock_local(); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); preempt_enable_notrace(); @@ -141,18 +150,29 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) return ret; } -static void wakeup_graph_return(struct ftrace_graph_ret *trace) +static void wakeup_graph_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; + u64 rettime; + int size; - ftrace_graph_addr_finish(trace); + ftrace_graph_addr_finish(gops, trace); if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; - __trace_graph_return(tr, trace, trace_ctx); + rettime = trace_clock_local(); + + calltime = fgraph_retrieve_data(gops->idx, &size); + if (!calltime) + return; + + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); atomic_dec(&data->disabled); preempt_enable_notrace(); @@ -376,7 +396,6 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned int trace_ctx) { - struct trace_event_call *call = &event_context_switch; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -394,8 +413,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = task_state_index(next); entry->next_cpu = task_cpu(next); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void @@ -404,7 +422,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned int trace_ctx) { - struct trace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -422,8 +439,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = task_state_index(wakee); entry->next_cpu = task_cpu(wakee); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); } static void notrace @@ -545,7 +561,7 @@ probe_wakeup(void *ignore, struct task_struct *p) * - wakeup_dl handles tasks belonging to sched_dl class only. */ if (tracing_dl || (wakeup_dl && !dl_task(p)) || - (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (wakeup_rt && !rt_or_dl_task(p)) || (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 529590499b1f..d88c44f1dfa5 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_PRINT: case TRACE_BRANCH: case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RETADDR_ENT: case TRACE_GRAPH_RET: return 1; } @@ -756,19 +757,284 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_FUNCTION_GRAPH_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +#define CHAR_NUMBER 123 +#define SHORT_NUMBER 12345 +#define WORD_NUMBER 1234567890 +#define LONG_NUMBER 1234567890123456789LL +#define ERRSTR_BUFLEN 128 + +struct fgraph_fixture { + struct fgraph_ops gops; + int store_size; + const char *store_type_name; + char error_str_buf[ERRSTR_BUFLEN]; + char *error_str; +}; + +static __init int store_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); + const char *type = fixture->store_type_name; + int size = fixture->store_size; + void *p; + + p = fgraph_reserve_data(gops->idx, size); + if (!p) { + snprintf(fixture->error_str_buf, ERRSTR_BUFLEN, + "Failed to reserve %s\n", type); + return 0; + } + + switch (size) { + case 1: + *(char *)p = CHAR_NUMBER; + break; + case 2: + *(short *)p = SHORT_NUMBER; + break; + case 4: + *(int *)p = WORD_NUMBER; + break; + case 8: + *(long long *)p = LONG_NUMBER; + break; + } + + return 1; +} + +static __init void store_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); + const char *type = fixture->store_type_name; + long long expect = 0; + long long found = -1; + int size; + char *p; + + p = fgraph_retrieve_data(gops->idx, &size); + if (!p) { + snprintf(fixture->error_str_buf, ERRSTR_BUFLEN, + "Failed to retrieve %s\n", type); + return; + } + if (fixture->store_size > size) { + snprintf(fixture->error_str_buf, ERRSTR_BUFLEN, + "Retrieved size %d is smaller than expected %d\n", + size, (int)fixture->store_size); + return; + } + + switch (fixture->store_size) { + case 1: + expect = CHAR_NUMBER; + found = *(char *)p; + break; + case 2: + expect = SHORT_NUMBER; + found = *(short *)p; + break; + case 4: + expect = WORD_NUMBER; + found = *(int *)p; + break; + case 8: + expect = LONG_NUMBER; + found = *(long long *)p; + break; + } + + if (found != expect) { + snprintf(fixture->error_str_buf, ERRSTR_BUFLEN, + "%s returned not %lld but %lld\n", type, expect, found); + return; + } + fixture->error_str = NULL; +} + +static int __init init_fgraph_fixture(struct fgraph_fixture *fixture) +{ + char *func_name; + int len; + + snprintf(fixture->error_str_buf, ERRSTR_BUFLEN, + "Failed to execute storage %s\n", fixture->store_type_name); + fixture->error_str = fixture->error_str_buf; + + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + return ftrace_set_filter(&fixture->gops.ops, func_name, len, 1); +} + +/* Test fgraph storage for each size */ +static int __init test_graph_storage_single(struct fgraph_fixture *fixture) +{ + int size = fixture->store_size; + int ret; + + pr_cont("PASSED\n"); + pr_info("Testing fgraph storage of %d byte%s: ", size, str_plural(size)); + + ret = init_fgraph_fixture(fixture); + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + return -1; + } + + ret = register_ftrace_graph(&fixture->gops); + if (ret) { + pr_warn("Failed to init store_bytes fgraph tracing\n"); + return -1; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_graph(&fixture->gops); + + if (fixture->error_str) { + pr_cont("*** %s ***", fixture->error_str); + return -1; + } + + return 0; +} + +static struct fgraph_fixture store_bytes[4] __initdata = { + [0] = { + .gops = { + .entryfunc = store_entry, + .retfunc = store_return, + }, + .store_size = 1, + .store_type_name = "byte", + }, + [1] = { + .gops = { + .entryfunc = store_entry, + .retfunc = store_return, + }, + .store_size = 2, + .store_type_name = "short", + }, + [2] = { + .gops = { + .entryfunc = store_entry, + .retfunc = store_return, + }, + .store_size = 4, + .store_type_name = "word", + }, + [3] = { + .gops = { + .entryfunc = store_entry, + .retfunc = store_return, + }, + .store_size = 8, + .store_type_name = "long long", + }, +}; + +static __init int test_graph_storage_multi(void) +{ + struct fgraph_fixture *fixture; + bool printed = false; + int i, j, ret; + + pr_cont("PASSED\n"); + pr_info("Testing multiple fgraph storage on a function: "); + + for (i = 0; i < ARRAY_SIZE(store_bytes); i++) { + fixture = &store_bytes[i]; + ret = init_fgraph_fixture(fixture); + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + printed = true; + goto out2; + } + } + + for (j = 0; j < ARRAY_SIZE(store_bytes); j++) { + fixture = &store_bytes[j]; + ret = register_ftrace_graph(&fixture->gops); + if (ret) { + pr_warn("Failed to init store_bytes fgraph tracing\n"); + printed = true; + goto out1; + } + } + + DYN_FTRACE_TEST_NAME(); +out1: + while (--j >= 0) { + fixture = &store_bytes[j]; + unregister_ftrace_graph(&fixture->gops); + + if (fixture->error_str && !printed) { + pr_cont("*** %s ***", fixture->error_str); + printed = true; + } + } +out2: + while (--i >= 0) { + fixture = &store_bytes[i]; + ftrace_free_filter(&fixture->gops.ops); + + if (fixture->error_str && !printed) { + pr_cont("*** %s ***", fixture->error_str); + printed = true; + } + } + return printed ? -1 : 0; +} + +/* Test the storage passed across function_graph entry and return */ +static __init int test_graph_storage(void) +{ + int ret; + + ret = test_graph_storage_single(&store_bytes[0]); + if (ret) + return ret; + ret = test_graph_storage_single(&store_bytes[1]); + if (ret) + return ret; + ret = test_graph_storage_single(&store_bytes[2]); + if (ret) + return ret; + ret = test_graph_storage_single(&store_bytes[3]); + if (ret) + return ret; + ret = test_graph_storage_multi(); + if (ret) + return ret; + return 0; +} +#else +static inline int test_graph_storage(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ -static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { /* This is harmlessly racy, we want to approximately detect a hang */ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) { + if (ftrace_dump_on_oops_enabled()) { ftrace_dump(DUMP_ALL); /* ftrace_dump() disables tracing */ tracing_on(); @@ -776,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) return 0; } - return trace_graph_entry(trace); + return trace_graph_entry(trace, gops, fregs); } static struct fgraph_ops fgraph_ops __initdata = { @@ -812,7 +1078,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * to detect and recover from possible hangs */ tracing_reset_online_cpus(&tr->array_buffer); - set_graph_array(tr); + fgraph_ops.private = tr; ret = register_ftrace_graph(&fgraph_ops); if (ret) { warn_failed_init_tracer(trace, ret); @@ -855,7 +1121,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, cond_resched(); tracing_reset_online_cpus(&tr->array_buffer); - set_graph_array(tr); + fgraph_ops.private = tr; /* * Some archs *cough*PowerPC*cough* add characters to the @@ -912,6 +1178,8 @@ trace_selftest_startup_function_graph(struct tracer *trace, ftrace_set_global_filter(NULL, 0, 1); #endif + ret = test_graph_storage(); + /* Don't test dynamic tracing, the function tracer already did */ out: /* Stop it if we failed */ @@ -1221,7 +1489,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tr->max_latency = 0; - while (p->on_rq) { + while (task_is_runnable(p)) { /* * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5a48dba912ea..14c6f272c4d8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -514,26 +514,24 @@ static const struct file_operations stack_trace_filter_fops = { #endif /* CONFIG_DYNAMIC_FTRACE */ int -stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, +stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int was_enabled; int ret; - mutex_lock(&stack_sysctl_mutex); + guard(mutex)(&stack_sysctl_mutex); was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (was_enabled == !!stack_tracer_enabled)) - goto out; + return ret; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: - mutex_unlock(&stack_sysctl_mutex); return ret; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index bb247beec447..b3b5586f104d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session) int ret = 0; int i; - mutex_lock(&session->stat_mutex); + guard(mutex)(&session->stat_mutex); __reset_stat_session(session); if (!ts->stat_cmp) @@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session) stat = ts->stat_start(ts); if (!stat) - goto exit; + return 0; ret = insert_stat(root, stat, ts->stat_cmp); if (ret) - goto exit; + return ret; /* * Iterate over the tracer stat entries and store them in an rbtree. @@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session) goto exit_free_rbtree; } -exit: - mutex_unlock(&session->stat_mutex); return ret; exit_free_rbtree: __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); return ret; } @@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret = -EINVAL; + int ret; if (!trace) return -EINVAL; @@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace) if (!trace->stat_start || !trace->stat_next || !trace->stat_show) return -EINVAL; + guard(mutex)(&all_stat_sessions_mutex); + /* Already registered? */ - mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) - goto out; + return -EINVAL; } - ret = -ENOMEM; /* Init the session */ session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) - goto out; + return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - goto out; + return ret; } - ret = 0; /* Register */ list_add_tail(&session->session_list, &all_stat_sessions); - out: - mutex_unlock(&all_stat_sessions_mutex); - return ret; + return 0; } void unregister_stat_tracer(struct tracer_stat *trace) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9c581d6da843..46aab0ab9350 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -299,6 +299,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int syscall_nr; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -338,6 +345,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) struct trace_event_buffer fbuffer; int syscall_nr; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -564,6 +578,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ + perf_fetch_caller_regs(regs); *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) @@ -575,6 +590,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; + struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -582,6 +598,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -602,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, NULL, &rctx); + rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); if (!rec) return; @@ -611,7 +634,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || + !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -666,6 +689,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg } __aligned(8) param; /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ + perf_fetch_caller_regs(regs); *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; @@ -676,12 +700,20 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; + struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; + /* + * Syscall probe called with preemption enabled, but the ring + * buffer and per-cpu data require preemption to be disabled. + */ + might_fault(); + guard(preempt_notrace)(); + syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; @@ -701,7 +733,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, NULL, &rctx); + rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); if (!rec) return; @@ -709,7 +741,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || + !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a84b85d8aac1..ccc762fbb69c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,6 +17,7 @@ #include <linux/string.h> #include <linux/rculist.h> #include <linux/filter.h> +#include <linux/percpu.h> #include "trace_dynevent.h" #include "trace_probe.h" @@ -58,11 +59,11 @@ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; - struct inode *inode; char *filename; + struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; - unsigned long nhit; + unsigned long __percpu *nhits; struct trace_probe tp; }; @@ -88,9 +89,11 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data); static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs); + unsigned long func, struct pt_regs *regs, + __u64 *data); #ifdef CONFIG_STACK_GROWSUP static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) @@ -211,8 +214,8 @@ static unsigned long translate_user_vaddr(unsigned long file_offset) /* Note that we don't verify it, since the code does not come from user space */ static int -process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, - void *base) +process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, + void *dest, void *base) { struct pt_regs *regs = rec; unsigned long val; @@ -337,7 +340,13 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - ret = trace_probe_init(&tu->tp, event, group, true); + tu->nhits = alloc_percpu(unsigned long); + if (!tu->nhits) { + ret = -ENOMEM; + goto error; + } + + ret = trace_probe_init(&tu->tp, event, group, true, nargs); if (ret < 0) goto error; @@ -349,6 +358,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) return tu; error: + free_percpu(tu->nhits); kfree(tu); return ERR_PTR(ret); @@ -362,6 +372,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu) path_put(&tu->path); trace_probe_cleanup(&tu->tp); kfree(tu->filename); + free_percpu(tu->nhits); kfree(tu); } @@ -487,11 +498,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) struct trace_uprobe *old_tu; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = validate_ref_ctr_offset(tu); if (ret) - goto end; + return ret; /* register as an event */ old_tu = find_probe_event(trace_probe_name(&tu->tp), @@ -500,11 +511,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu) if (is_ret_probe(tu) != is_ret_probe(old_tu)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_uprobe(tu, old_tu); + return -EEXIST; } - goto end; + return append_trace_uprobe(tu, old_tu); } ret = register_uprobe_event(tu); @@ -514,14 +523,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); -end: - mutex_unlock(&event_mutex); - return ret; } @@ -556,6 +562,8 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; + if (argc - 2 > MAX_TRACE_ARGS) + return -E2BIG; if (argv[0][1] == ':') event = &argv[0][2]; @@ -681,7 +689,7 @@ static int __trace_uprobe_create(int argc, const char **argv) tu->filename = filename; /* parse arguments */ - for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + for (i = 0; i < argc; i++) { struct traceprobe_parse_context ctx = { .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, }; @@ -815,13 +823,21 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_uprobe *tu; + unsigned long nhits; + int cpu; if (!is_trace_uprobe(ev)) return 0; tu = to_trace_uprobe(ev); + + nhits = 0; + for_each_possible_cpu(cpu) { + nhits += per_cpu(*tu->nhits, cpu); + } + seq_printf(m, " %s %-44s %15lu\n", tu->filename, - trace_probe_name(&tu->tp), tu->nhit); + trace_probe_name(&tu->tp), nhits); return 0; } @@ -854,9 +870,11 @@ static const struct file_operations uprobe_profile_ops = { struct uprobe_cpu_buffer { struct mutex mutex; void *buf; + int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; +#define MAX_UCB_BUFFER_SIZE PAGE_SIZE static int uprobe_buffer_init(void) { @@ -940,12 +958,41 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void) static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { + if (!ucb) + return; mutex_unlock(&ucb->mutex); } +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, + struct pt_regs *regs, + struct uprobe_cpu_buffer **ucbp) +{ + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + if (*ucbp) + return *ucbp; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + dsize = __get_data_size(&tu->tp, regs, NULL); + + ucb = uprobe_buffer_get(); + ucb->dsize = tu->tp.size + dsize; + + if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { + ucb->dsize = MAX_UCB_BUFFER_SIZE; + dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; + } + + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + + *ucbp = ucb; + return ucb; +} + static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize, + struct uprobe_cpu_buffer *ucb, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; @@ -956,14 +1003,11 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) - return; - if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; @@ -977,23 +1021,26 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; + struct uprobe_cpu_buffer *ucb; if (is_ret_probe(tu)) return 0; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, 0, regs, ucb, link->file); rcu_read_unlock(); return 0; @@ -1001,13 +1048,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; + struct uprobe_cpu_buffer *ucb; + + ucb = prepare_uprobe_buffer(tu, regs, ucbp); rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, func, regs, ucb, link->file); rcu_read_unlock(); } @@ -1047,43 +1097,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e return trace_handle_return(s); } -typedef bool (*filter_func_t)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); +typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - int ret; + struct inode *inode = d_real_inode(tu->path.dentry); + struct uprobe *uprobe; tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); - if (tu->ref_ctr_offset) - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - else - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - - if (ret) - tu->inode = NULL; - - return ret; + tu->uprobe = uprobe; + return 0; } static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; + bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - if (!tu->inode) + if (!tu->uprobe) continue; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); + sync = true; + tu->uprobe = NULL; } + if (sync) + uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, @@ -1199,9 +1246,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; - if (filter->nr_systemwide) - return true; - list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; @@ -1282,7 +1326,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } @@ -1306,7 +1350,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; @@ -1316,8 +1360,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return err; } -static bool uprobe_perf_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { struct trace_uprobe_filter *filter; struct trace_uprobe *tu; @@ -1326,6 +1369,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; + /* + * speculative short-circuiting check to avoid unnecessarily taking + * filter->rwlock below, if the uprobe has system-wide consumer + */ + if (READ_ONCE(filter->nr_systemwide)) + return true; + read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); @@ -1335,10 +1385,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; + struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; @@ -1346,9 +1397,13 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, #ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { + const struct bpf_prog_array *array; u32 ret; - ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run); + rcu_read_lock_trace(); + array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held()); + ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run); + rcu_read_unlock_trace(); if (!ret) return; } @@ -1356,7 +1411,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -1379,13 +1435,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); - if (size - esize > tu->tp.size + dsize) { - int len = tu->tp.size + dsize; - - memset(data + len, 0, size - esize - len); - } + if (size - esize > ucb->dsize) + memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1395,21 +1448,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + if (!uprobe_perf_filter(&tu->consumer, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb, dsize); + __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { - __uprobe_perf_func(tu, func, regs, ucb, dsize); + __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1470,17 +1523,17 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, } } -static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; int ret = 0; - tu = container_of(con, struct trace_uprobe, consumer); - tu->nhit++; + + this_cpu_inc(*tu->nhits); udd.tu = tu; udd.bp_addr = instruction_pointer(regs); @@ -1490,30 +1543,24 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb, dsize); + ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb, dsize); + ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; } static int uretprobe_dispatcher(struct uprobe_consumer *con, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + __u64 *data) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; tu = container_of(con, struct trace_uprobe, consumer); @@ -1525,18 +1572,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb, dsize); + uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb, dsize); + uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index a4dcf0f24352..1921ade45be3 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map) struct tracing_map_elt *elt = NULL; int idx; - idx = atomic_inc_return(&map->next_elt); + idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts); if (idx < map->max_elts) { elt = *(TRACING_MAP_ELT(map->elts, idx)); if (map->ops && map->ops->elt_init) @@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map) { unsigned int i; - atomic_set(&map->next_elt, -1); + atomic_set(&map->next_elt, 0); atomic64_set(&map->hits, 0); atomic64_set(&map->drops, 0); @@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits, map->map_bits = map_bits; map->max_elts = (1 << map_bits); - atomic_set(&map->next_elt, -1); + atomic_set(&map->next_elt, 0); map->map_size = (1 << (map_bits + 1)); map->ops = ops; @@ -845,15 +845,11 @@ int tracing_map_init(struct tracing_map *map) static int cmp_entries_dup(const void *A, const void *B) { const struct tracing_map_sort_entry *a, *b; - int ret = 0; a = *(const struct tracing_map_sort_entry **)A; b = *(const struct tracing_map_sort_entry **)B; - if (memcmp(a->key, b->key, a->elt->map->key_size)) - ret = 1; - - return ret; + return memcmp(a->key, b->key, a->elt->map->key_size); } static int cmp_entries_sum(const void *A, const void *B) |