diff options
Diffstat (limited to 'kernel/trace')
83 files changed, 4221 insertions, 1612 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a3f35c7d83b6..f135d04e1860 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool +config HAVE_EXTRA_IPI_TRACEPOINTS + bool + help + For architectures that use ipi_raise, ipi_entry and ipi_exit + tracepoints. + config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help @@ -74,11 +80,6 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. -config HAVE_FTRACE_MCOUNT_RECORD - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -275,7 +276,7 @@ config FUNCTION_TRACE_ARGS funcgraph-args (for the function graph tracer) config DYNAMIC_FTRACE - bool "enable/disable function tracing dynamically" + bool depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y @@ -803,27 +804,22 @@ config BPF_KPROBE_OVERRIDE Allows BPF to override the execution of a probed function and set a different return value. This is used for error injection. -config FTRACE_MCOUNT_RECORD - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_FTRACE_MCOUNT_RECORD - config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY bool - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_CC def_bool y depends on $(cc-option,-mrecord-mcount) depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_OBJTOOL def_bool y depends on HAVE_OBJTOOL_MCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT @@ -831,7 +827,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on !FTRACE_MCOUNT_USE_OBJTOOL - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config TRACING_MAP bool diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3f6a7bdc6edf..47168d2afbf1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1875,6 +1875,29 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) case REQ_OP_READ: rwbs[i++] = 'R'; break; + case REQ_OP_ZONE_APPEND: + rwbs[i++] = 'Z'; + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + rwbs[i++] = 'Z'; + rwbs[i++] = 'R'; + if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL) + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_FINISH: + rwbs[i++] = 'Z'; + rwbs[i++] = 'F'; + break; + case REQ_OP_ZONE_OPEN: + rwbs[i++] = 'Z'; + rwbs[i++] = 'O'; + break; + case REQ_OP_ZONE_CLOSE: + rwbs[i++] = 'Z'; + rwbs[i++] = 'C'; + break; default: rwbs[i++] = 'N'; } @@ -1890,6 +1913,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) if (opf & REQ_ATOMIC) rwbs[i++] = 'U'; + WARN_ON_ONCE(i >= RWBS_LEN); + rwbs[i] = '\0'; } EXPORT_SYMBOL_GPL(blk_fill_rwbs); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 187dc37d61d4..3ae52978cae6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -572,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) return value; } -static const struct bpf_func_proto bpf_perf_event_read_proto = { +const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, @@ -781,8 +781,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) return (unsigned long) task_pt_regs(task); } -BTF_ID_LIST(bpf_task_pt_regs_ids) -BTF_ID(struct, pt_regs) +BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, @@ -882,7 +881,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_proto = { +const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, @@ -894,7 +893,7 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_thread_proto = { +const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1185,7 +1184,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return entry_cnt * br_entry_size; } -static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { +const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, @@ -1270,7 +1269,7 @@ __bpf_kfunc_start_defs(); * Return: a bpf_key pointer with a valid key pointer if the key is found, a * NULL pointer otherwise. */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) { key_ref_t key_ref; struct bpf_key *bkey; @@ -1430,56 +1429,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto *func_proto; switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_map_lookup_percpu_elem: - return &bpf_map_lookup_percpu_elem_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - case BPF_FUNC_ktime_get_boot_ns: - return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_get_current_task: - return &bpf_get_current_task_proto; - case BPF_FUNC_get_current_task_btf: - return &bpf_get_current_task_btf_proto; - case BPF_FUNC_task_pt_regs: - return &bpf_task_pt_regs_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_perf_event_read: - return &bpf_perf_event_read_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_read_user: - return &bpf_probe_read_user_proto; - case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read_user_str: - return &bpf_probe_read_user_str_proto; - case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1488,65 +1439,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif -#ifdef CONFIG_CGROUPS - case BPF_FUNC_cgrp_storage_get: - return &bpf_cgrp_storage_get_proto; - case BPF_FUNC_cgrp_storage_delete: - return &bpf_cgrp_storage_delete_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; -#endif - case BPF_FUNC_send_signal: - return &bpf_send_signal_proto; - case BPF_FUNC_send_signal_thread: - return &bpf_send_signal_thread_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; - case BPF_FUNC_ringbuf_output: - return &bpf_ringbuf_output_proto; - case BPF_FUNC_ringbuf_reserve: - return &bpf_ringbuf_reserve_proto; - case BPF_FUNC_ringbuf_submit: - return &bpf_ringbuf_submit_proto; - case BPF_FUNC_ringbuf_discard: - return &bpf_ringbuf_discard_proto; - case BPF_FUNC_ringbuf_query: - return &bpf_ringbuf_query_proto; - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - case BPF_FUNC_get_task_stack: - return prog->sleepable ? &bpf_get_task_stack_sleepable_proto - : &bpf_get_task_stack_proto; - case BPF_FUNC_copy_from_user: - return &bpf_copy_from_user_proto; - case BPF_FUNC_copy_from_user_task: - return &bpf_copy_from_user_task_proto; - case BPF_FUNC_snprintf_btf: - return &bpf_snprintf_btf_proto; - case BPF_FUNC_per_cpu_ptr: - return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_this_cpu_ptr: - return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; - return &bpf_task_storage_delete_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_snprintf: - return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; - case BPF_FUNC_get_branch_snapshot: - return &bpf_get_branch_snapshot_proto; - case BPF_FUNC_find_vma: - return &bpf_find_vma_proto; - case BPF_FUNC_trace_vprintk: - return bpf_get_trace_vprintk_proto(); default: break; } @@ -1858,7 +1752,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } @@ -2571,7 +2465,6 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; - u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2691,7 +2584,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; - info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) @@ -2725,10 +2618,37 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + + seq_printf(seq, + "kprobe_cnt:\t%u\n" + "missed:\t%lu\n", + kmulti_link->cnt, + kmulti_link->fp.nmissed); + + seq_printf(seq, "%s\t %s\n", "cookie", "func"); + for (int i = 0; i < kmulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %pS\n", + kmulti_link->cookies[i], + (void *)kmulti_link->addrs[i]); + } +} +#endif + static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_kprobe_multi_show_fdinfo, +#endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) @@ -2987,6 +2907,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; @@ -3062,7 +2985,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, - &bpf_kprobe_multi_link_lops, prog); + &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -3078,7 +3001,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; - link->flags = flags; + link->link.flags = flags; if (cookies) { /* @@ -3147,7 +3070,6 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; - u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3211,7 +3133,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; - info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; @@ -3256,10 +3178,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_uprobe_multi_link *umulti_link; + char *p, *buf; + pid_t pid; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return; + + p = d_path(&umulti_link->path, buf, PATH_MAX); + if (IS_ERR(p)) { + kfree(buf); + return; + } + + pid = umulti_link->task ? + task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + seq_printf(seq, + "uprobe_cnt:\t%u\n" + "pid:\t%u\n" + "path:\t%s\n", + umulti_link->cnt, pid, p); + + seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); + for (int i = 0; i < umulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %#llx\t %#lx\n", + umulti_link->uprobes[i].cookie, + umulti_link->uprobes[i].offset, + umulti_link->uprobes[i].ref_ctr_offset); + } + + kfree(buf); +} +#endif + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_uprobe_multi_show_fdinfo, +#endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3376,6 +3342,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_uprobe_multi(prog)) return -EINVAL; @@ -3417,7 +3386,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; @@ -3466,10 +3437,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; - link->flags = flags; + link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, - &bpf_uprobe_multi_link_lops, prog); + &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), @@ -3565,6 +3536,146 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) late_initcall(bpf_kprobe_multi_kfuncs_init); +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); + +/* + * The __always_inline is to make sure the compiler doesn't + * generate indirect calls into callbacks, which is expensive, + * on some kernel configurations. This allows compiler to put + * direct calls into all the specific callback implementations + * (copy_user_data_sleepable, copy_user_data_nofault, and so on) + */ +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, + const void *unsafe_src, + copy_fn_t str_copy_fn, + struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + u32 chunk_sz, off; + void *dst_slice; + int cnt, err; + char buf[256]; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return str_copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz - 1) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + /* Expect str_copy_fn to return count of copied bytes, including + * zero terminator. Next iteration increment off by chunk_sz - 1 to + * overwrite NUL. + */ + cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (cnt < 0) + return cnt; + err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); + if (err) + return err; + if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ + return off + cnt; + } + return off; +} + +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, + u32 size, const void *unsafe_src, + copy_fn_t copy_fn, struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + void *dst_slice; + char buf[256]; + u32 off, chunk_sz; + int err; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (err) + return err; + err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); + if (err) + return err; + } + return 0; +} + +static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (!tsk) { /* Read from the current task */ + ret = copy_from_user(dst, (const void __user *)unsafe_src, size); + if (ret) + return -EFAULT; + return 0; + } + + ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); + if (ret != size) + return -EFAULT; + return 0; +} + +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_kernel_nofault(dst, unsafe_src, size); +} + +static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (unlikely(size == 0)) + return 0; + + if (tsk) { + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); + } else { + ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); + /* strncpy_from_user does not guarantee NUL termination */ + if (ret >= 0) + ((char *)dst)[ret] = '\0'; + } + + if (ret < 0) + return ret; + return ret + 1; +} + +static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_kernel_nofault(dst, unsafe_src, size); +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, @@ -3576,4 +3687,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, + copy_kernel_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, + copy_kernel_str_nofault, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, tsk); +} + +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, tsk); +} + __bpf_kfunc_end_defs(); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8d925cbdce3a..f4d200f0c610 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1325,6 +1325,10 @@ int register_ftrace_graph(struct fgraph_ops *gops) int ret = 0; int i = -1; + if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH, + "function graph ops registered again")) + return -EBUSY; + guard(mutex)(&ftrace_lock); if (!fgraph_stack_cachep) { @@ -1382,6 +1386,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) /* Always save the function, and reset at unregistering */ gops->saved_func = gops->entryfunc; + gops->ops.flags |= FTRACE_OPS_FL_GRAPH; + ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); if (!ret) fgraph_array[i] = gops; @@ -1399,17 +1405,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) { int command = 0; + if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH), + "function graph ops unregistered without registering")) + return; + guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - return; + goto out; if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || fgraph_array[gops->idx] != gops)) - return; + goto out; if (fgraph_lru_release_index(gops->idx) < 0) - return; + goto out; fgraph_array[gops->idx] = &fgraph_stub; @@ -1432,4 +1442,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); } gops->saved_func = NULL; + out: + gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index ba7ff14f5339..c8034dfc1070 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -352,7 +352,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, size_words = SIZE_IN_LONG(size); ret_ip = ftrace_regs_get_instruction_pointer(fregs); - preempt_disable(); + preempt_disable_notrace(); curr = 0; while (size_words > curr) { @@ -368,7 +368,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, } curr += size; } - preempt_enable(); + preempt_enable_notrace(); } NOKPROBE_SYMBOL(fprobe_return); @@ -648,6 +648,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) #define FPROBE_IPS_MAX INT_MAX +int fprobe_count_ips_from_filter(const char *filter, const char *notfilter) +{ + return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); +} + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6981830c3128..00b76d450a89 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -188,7 +188,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, fregs); } -static void ftrace_sync_ipi(void *data) +void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); @@ -1042,10 +1042,6 @@ static struct ftrace_ops *removed_ops; */ static bool update_all_ops; -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -4373,6 +4369,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip) } #endif +static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + struct ftrace_ops *subops; + bool first = true; + + list_for_each_entry(subops, &ops->subop_list, list) { + if (!((subops->flags & FTRACE_OPS_FL_ENABLED) && + hash_contains_ip(rec->ip, subops->func_hash))) + continue; + if (first) { + seq_printf(m, "\tsubops:"); + first = false; + } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (subops->flags & FTRACE_OPS_FL_GRAPH) { + struct fgraph_ops *gops; + + gops = container_of(subops, struct fgraph_ops, ops); + seq_printf(m, " {ent:%pS ret:%pS}", + (void *)gops->entryfunc, + (void *)gops->retfunc); + continue; + } +#endif + if (subops->trampoline) { + seq_printf(m, " {%pS (%pS)}", + (void *)subops->trampoline, + (void *)subops->func); + add_trampoline_func(m, subops, rec); + } else { + seq_printf(m, " {%pS}", + (void *)subops->func); + } + } +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -4425,6 +4457,7 @@ static int t_show(struct seq_file *m, void *v) (void *)ops->trampoline, (void *)ops->func); add_trampoline_func(m, ops, rec); + print_subops(m, ops, rec); ops = ftrace_find_tramp_ops_next(rec, ops); } while (ops); } else @@ -4437,6 +4470,7 @@ static int t_show(struct seq_file *m, void *v) if (ops) { seq_printf(m, "\tops: %pS (%pS)", ops, ops->func); + print_subops(m, ops, rec); } else { seq_puts(m, "\tops: ERROR!"); } @@ -5170,8 +5204,12 @@ struct ftrace_func_map { void *data; }; +/* + * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash). + * The hash field must be the first field. + */ struct ftrace_func_mapper { - struct ftrace_hash hash; + struct ftrace_hash hash; /* Must be first! */ }; /** @@ -5306,6 +5344,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, } } } + /* This also frees the mapper itself */ free_ftrace_hash(&mapper->hash); } @@ -7395,9 +7434,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7406,6 +7446,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -7584,6 +7627,9 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; + if (ftrace_disabled) + return NULL; + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); if (!mod_map) return NULL; diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index c62b9b3cfb3d..090bb5ea4a19 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -81,13 +81,9 @@ static inline bool upper_empty(union upper_chunk *chunk) { /* * If chunk->data has no lower chunks, it will be the same - * as a zeroed bitmask. Use find_first_bit() to test it - * and if it doesn't find any bits set, then the array - * is empty. + * as a zeroed bitmask. */ - int bit = find_first_bit((unsigned long *)chunk->data, - sizeof(chunk->data) * 8); - return bit >= sizeof(chunk->data) * 8; + return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data)); } static inline int pid_split(unsigned int pid, unsigned int *upper1, diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 21bb161c2316..f2fe33573e54 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -17,5 +17,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); -EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3f9bf562beea..5176e0270f07 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage) *bpage = list_entry(p, struct buffer_page, list); } +static inline void rb_dec_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.prev); + + *bpage = list_entry(p, struct buffer_page, list); +} + static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page; + struct buffer_page *head_page, *orig_head; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; + u64 ts; int i; if (!meta || !meta->head_buffer) @@ -1885,8 +1893,98 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); - head_page = cpu_buffer->head_page; + orig_head = head_page = cpu_buffer->head_page; + ts = head_page->page->time_stamp; + + /* + * Try to rewind the head so that we can read the pages which already + * read in the previous boot. + */ + if (head_page == cpu_buffer->tail_page) + goto skip_rewind; + + rb_dec_page(&head_page); + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { + + /* Rewind until tail (writer) page. */ + if (head_page == cpu_buffer->tail_page) + break; + + /* Ensure the page has older data than head. */ + if (ts < head_page->page->time_stamp) + break; + + ts = head_page->page->time_stamp; + /* Ensure the page has correct timestamp and some data. */ + if (!ts || rb_page_commit(head_page) == 0) + break; + + /* Stop rewind if the page is invalid. */ + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) + break; + + /* Recover the number of entries and update stats. */ + local_set(&head_page->entries, ret); + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; + entry_bytes += rb_page_commit(head_page); + } + if (i) + pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); + + /* The last rewound page must be skipped. */ + if (head_page != orig_head) + rb_inc_page(&head_page); + + /* + * If the ring buffer was rewound, then inject the reader page + * into the location just before the original head page. + */ + if (head_page != orig_head) { + struct buffer_page *bpage = orig_head; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. + */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } + + /* We'll restart verifying from orig_head */ + head_page = orig_head; + } + skip_rewind: /* If the commit_buffer is the reader page, update the commit page */ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { cpu_buffer->commit_page = cpu_buffer->reader_page; @@ -2226,7 +2324,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL; struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; @@ -2252,7 +2350,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2318,13 +2416,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2359,7 +2455,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, unsigned long scratch_size, struct lock_class_key *key) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2373,7 +2469,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2472,7 +2568,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2484,8 +2580,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -2849,6 +2943,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2893,7 +2993,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2933,7 +3032,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2961,8 +3059,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2981,7 +3077,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -4121,7 +4216,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static const char *show_irq_str(int bits) { - const char *type[] = { + static const char * type[] = { ".", // 0 "s", // 1 "h", // 2 @@ -4684,10 +4779,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); @@ -4885,6 +4977,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) } /** + * ring_buffer_record_is_on_cpu - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * @cpu: The CPU to test if the ring buffer can write too + * + * Returns true if the ring buffer is in a state that it accepts writes + * for a particular CPU. + */ +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = buffer->buffers[cpu]; + + return ring_buffer_record_is_set_on(buffer) && + !atomic_read(&cpu_buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -5330,7 +5440,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: @@ -5834,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. - * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5877,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. - */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -6002,6 +6068,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ording in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -6010,7 +6109,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); @@ -6080,21 +6181,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -6282,37 +6378,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6349,7 +6441,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -6508,38 +6599,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6573,7 +6663,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6582,7 +6672,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6640,7 +6730,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6667,11 +6756,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); @@ -6764,7 +6849,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6869,7 +6954,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6878,7 +6962,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -6926,23 +7009,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); + WARN_ON(cnt != nr_subbufs); + /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; @@ -7034,7 +7123,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7102,10 +7191,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); @@ -7120,9 +7207,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else @@ -7138,36 +7222,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; - int err = 0; + int err; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7195,35 +7277,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7238,12 +7314,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) @@ -7284,8 +7355,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7306,13 +7377,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b39f36013ef2..5b4be87ba59d 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,19 +1,31 @@ # SPDX-License-Identifier: GPL-2.0-only # -config DA_MON_EVENTS +config RV_MON_EVENTS + bool + +config RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_IMPLICIT - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_ID - select DA_MON_EVENTS + select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS + bool + +config LTL_MON_EVENTS_ID + select RV_MON_EVENTS + bool + +config RV_LTL_MONITOR bool menuconfig RV bool "Runtime Verification" - depends on TRACING + select TRACING help Enable the kernel runtime verification infrastructure. RV is a lightweight (yet rigorous) method that complements classical @@ -25,15 +37,34 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst +config RV_PER_TASK_MONITORS + int "Maximum number of per-task monitor" + depends on RV + range 1 8 + default 2 + help + This option configures the maximum number of per-task RV monitors that can run + simultaneously. + source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" + source "kernel/trace/rv/monitors/sched/Kconfig" -source "kernel/trace/rv/monitors/tss/Kconfig" source "kernel/trace/rv/monitors/sco/Kconfig" source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" -source "kernel/trace/rv/monitors/sncid/Kconfig" +source "kernel/trace/rv/monitors/sts/Kconfig" +source "kernel/trace/rv/monitors/nrp/Kconfig" +source "kernel/trace/rv/monitors/sssw/Kconfig" +source "kernel/trace/rv/monitors/opid/Kconfig" +# Add new sched monitors here + +source "kernel/trace/rv/monitors/rtapp/Kconfig" +source "kernel/trace/rv/monitors/pagefault/Kconfig" +source "kernel/trace/rv/monitors/sleep/Kconfig" +# Add new rtapp monitors here + # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index f9b2cd0483c3..750e4ad6fa0f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -6,12 +6,17 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o -obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o -obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o +obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o +obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o +obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o +obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o +obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o +obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o +obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig index 479f86f52e60..f5ec08f65535 100644 --- a/kernel/trace/rv/monitors/tss/Kconfig +++ b/kernel/trace/rv/monitors/nrp/Kconfig @@ -1,14 +1,16 @@ # SPDX-License-Identifier: GPL-2.0-only # -config RV_MON_TSS +config RV_MON_NRP depends on RV depends on RV_MON_SCHED - default y - select DA_MON_EVENTS_IMPLICIT - bool "tss monitor" + default y if !ARM64 + select DA_MON_EVENTS_ID + bool "nrp monitor" help - Monitor to ensure sched_switch happens only in scheduling context. + Monitor to ensure preemption requires need resched. This monitor is part of the sched monitors collection. + This monitor is unstable on arm64, say N unless you are testing it. + For further information, see: Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c new file mode 100644 index 000000000000..5a83b7171432 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "nrp" + +#include <trace/events/irq.h> +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "nrp.h" + +static struct rv_monitor rv_nrp; +DECLARE_DA_MON_PER_TASK(nrp, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, + int cpu, int tif) +{ + /* + * Although need_resched leads to both the rescheduling and preempt_irq + * states, it is safer to start the monitor always in preempt_irq, + * which may not mirror the system state but makes the monitor simpler, + */ + if (tif == TIF_NEED_RESCHED) + da_handle_start_event_nrp(tsk, sched_need_resched_nrp); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + if (preempt) + da_handle_event_nrp(current, schedule_entry_preempt_nrp); + else + da_handle_event_nrp(current, schedule_entry_nrp); +} + +static int enable_nrp(void) +{ + int retval; + + retval = da_monitor_init_nrp(); + if (retval) + return retval; + + rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + attach_vector_irq(); + + return 0; +} + +static void disable_nrp(void) +{ + rv_nrp.enabled = 0; + + rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + detach_vector_irq(); + + da_monitor_destroy_nrp(); +} + +static struct rv_monitor rv_nrp = { + .name = "nrp", + .description = "need resched preempts.", + .enable = enable_nrp, + .disable = disable_nrp, + .reset = da_monitor_reset_all_nrp, + .enabled = 0, +}; + +static int __init register_nrp(void) +{ + return rv_register_monitor(&rv_nrp, &rv_sched); +} + +static void __exit unregister_nrp(void) +{ + rv_unregister_monitor(&rv_nrp); +} + +module_init(register_nrp); +module_exit(unregister_nrp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nrp: need resched preempts."); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h new file mode 100644 index 000000000000..c9f12207cbf6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nrp automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_nrp { + preempt_irq_nrp = 0, + any_thread_running_nrp, + nested_preempt_nrp, + rescheduling_nrp, + state_max_nrp +}; + +#define INVALID_STATE state_max_nrp + +enum events_nrp { + irq_entry_nrp = 0, + sched_need_resched_nrp, + schedule_entry_nrp, + schedule_entry_preempt_nrp, + event_max_nrp +}; + +struct automaton_nrp { + char *state_names[state_max_nrp]; + char *event_names[event_max_nrp]; + unsigned char function[state_max_nrp][event_max_nrp]; + unsigned char initial_state; + bool final_states[state_max_nrp]; +}; + +static const struct automaton_nrp automaton_nrp = { + .state_names = { + "preempt_irq", + "any_thread_running", + "nested_preempt", + "rescheduling" + }, + .event_names = { + "irq_entry", + "sched_need_resched", + "schedule_entry", + "schedule_entry_preempt" + }, + .function = { + { + preempt_irq_nrp, + preempt_irq_nrp, + nested_preempt_nrp, + nested_preempt_nrp + }, + { + any_thread_running_nrp, + rescheduling_nrp, + any_thread_running_nrp, + INVALID_STATE + }, + { + nested_preempt_nrp, + preempt_irq_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + { + preempt_irq_nrp, + rescheduling_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + }, + .initial_state = preempt_irq_nrp, + .final_states = { 0, 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h new file mode 100644 index 000000000000..2e13497de3b6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NRP +DEFINE_EVENT(event_da_monitor_id, event_nrp, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nrp, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_NRP */ diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig new file mode 100644 index 000000000000..561d32da572b --- /dev/null +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_OPID + depends on RV + depends on TRACE_IRQFLAGS + depends on TRACE_PREEMPT_TOGGLE + depends on RV_MON_SCHED + default y if PREEMPT_RT + select DA_MON_EVENTS_IMPLICIT + bool "opid monitor" + help + Monitor to ensure operations like wakeup and need resched occur with + interrupts and preemption disabled or during IRQs, where preemption + may not be disabled explicitly. + + This monitor is unstable on !PREEMPT_RT, say N unless you are testing it. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c new file mode 100644 index 000000000000..50d64e7fb8c4 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "opid" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "opid.h" + +static struct rv_monitor rv_opid; +DECLARE_DA_MON_PER_CPU(opid, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_opid(irq_entry_opid); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(irq_disable_opid); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(irq_enable_opid); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_opid(irq_entry_opid); +} + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(preempt_disable_opid); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_opid(preempt_enable_opid); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) +{ + /* The monitor's intitial state is not in_irq */ + if (this_cpu_read(hardirq_context)) + da_handle_event_opid(sched_need_resched_opid); + else + da_handle_start_event_opid(sched_need_resched_opid); +} + +static void handle_sched_waking(void *data, struct task_struct *p) +{ + /* The monitor's intitial state is not in_irq */ + if (this_cpu_read(hardirq_context)) + da_handle_event_opid(sched_waking_opid); + else + da_handle_start_event_opid(sched_waking_opid); +} + +static int enable_opid(void) +{ + int retval; + + retval = da_monitor_init_opid(); + if (retval) + return retval; + + rv_attach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); + attach_vector_irq(); + + return 0; +} + +static void disable_opid(void) +{ + rv_opid.enabled = 0; + + rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); + detach_vector_irq(); + + da_monitor_destroy_opid(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_opid = { + .name = "opid", + .description = "operations with preemption and irq disabled.", + .enable = enable_opid, + .disable = disable_opid, + .reset = da_monitor_reset_all_opid, + .enabled = 0, +}; + +static int __init register_opid(void) +{ + return rv_register_monitor(&rv_opid, &rv_sched); +} + +static void __exit unregister_opid(void) +{ + rv_unregister_monitor(&rv_opid); +} + +module_init(register_opid); +module_exit(unregister_opid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("opid: operations with preemption and irq disabled."); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h new file mode 100644 index 000000000000..b4b8c2ff7f64 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of opid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_opid { + disabled_opid = 0, + enabled_opid, + in_irq_opid, + irq_disabled_opid, + preempt_disabled_opid, + state_max_opid +}; + +#define INVALID_STATE state_max_opid + +enum events_opid { + irq_disable_opid = 0, + irq_enable_opid, + irq_entry_opid, + preempt_disable_opid, + preempt_enable_opid, + sched_need_resched_opid, + sched_waking_opid, + event_max_opid +}; + +struct automaton_opid { + char *state_names[state_max_opid]; + char *event_names[event_max_opid]; + unsigned char function[state_max_opid][event_max_opid]; + unsigned char initial_state; + bool final_states[state_max_opid]; +}; + +static const struct automaton_opid automaton_opid = { + .state_names = { + "disabled", + "enabled", + "in_irq", + "irq_disabled", + "preempt_disabled" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "preempt_disable", + "preempt_enable", + "sched_need_resched", + "sched_waking" + }, + .function = { + { + INVALID_STATE, + preempt_disabled_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + disabled_opid, + disabled_opid + }, + { + irq_disabled_opid, + INVALID_STATE, + INVALID_STATE, + preempt_disabled_opid, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + INVALID_STATE, + INVALID_STATE, + in_irq_opid, + in_irq_opid + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + INVALID_STATE + }, + { + disabled_opid, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = disabled_opid, + .final_states = { 0, 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h index 3ce42a57671d..3df6ff955c30 100644 --- a/kernel/trace/rv/monitors/sncid/sncid_trace.h +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -4,12 +4,12 @@ * Snippet to be included in rv_trace.h */ -#ifdef CONFIG_RV_MON_SNCID -DEFINE_EVENT(event_da_monitor, event_sncid, +#ifdef CONFIG_RV_MON_OPID +DEFINE_EVENT(event_da_monitor, event_opid, TP_PROTO(char *state, char *event, char *next_state, bool final_state), TP_ARGS(state, event, next_state, final_state)); -DEFINE_EVENT(error_da_monitor, error_sncid, +DEFINE_EVENT(error_da_monitor, error_opid, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_SNCID */ +#endif /* CONFIG_RV_MON_OPID */ diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig new file mode 100644 index 000000000000..5e16625f1653 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_PAGEFAULT + depends on RV + select RV_LTL_MONITOR + depends on RV_MON_RTAPP + depends on X86 || RISCV + default y + select LTL_MON_EVENTS_ID + bool "pagefault monitor" + help + Monitor that real-time tasks do not raise page faults, causing + undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + This monitor does not affect execution speed while it is not running, + therefore it is safe to enable this in production kernel. diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c new file mode 100644 index 000000000000..9fe6123b2200 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <linux/tracepoint.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "pagefault" + +#include <rv_trace.h> +#include <trace/events/exceptions.h> +#include <monitors/rtapp/rtapp.h> + +#include "pagefault.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. + */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + if (task_creation) + ltl_atom_set(mon, LTL_PAGEFAULT, false); +} + +static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + ltl_atom_pulse(current, LTL_PAGEFAULT, true); +} + +static int enable_pagefault(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + return 0; +} + +static void disable_pagefault(void) +{ + rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_pagefault = { + .name = "pagefault", + .description = "Monitor that RT tasks do not raise page faults", + .enable = enable_pagefault, + .disable = disable_pagefault, +}; + +static int __init register_pagefault(void) +{ + return rv_register_monitor(&rv_pagefault, &rv_rtapp); +} + +static void __exit unregister_pagefault(void) +{ + rv_unregister_monitor(&rv_pagefault); +} + +module_init(register_pagefault); +module_exit(unregister_pagefault); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults"); diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h new file mode 100644 index 000000000000..c580ec194009 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. + * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME pagefault + +enum ltl_atom { + LTL_PAGEFAULT, + LTL_RT, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "pa", + "rt", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + if (val4) + __set_bit(S0, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + switch (state) { + case S0: + if (val4) + __set_bit(S0, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h new file mode 100644 index 000000000000..fe1f82597b1a --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_PAGEFAULT +DEFINE_EVENT(event_ltl_monitor_id, event_pagefault, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_pagefault, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_PAGEFAULT */ diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig new file mode 100644 index 000000000000..1ce9370a9ba8 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_RTAPP + depends on RV + depends on RV_PER_TASK_MONITORS >= 2 + bool "rtapp monitor" + help + Collection of monitors to check for common problems with real-time + application that may cause unexpected latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c new file mode 100644 index 000000000000..fd75fc927d65 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "rtapp" + +#include "rtapp.h" + +struct rv_monitor rv_rtapp; + +struct rv_monitor rv_rtapp = { + .name = "rtapp", + .description = "Collection of monitors for detecting problems with real-time applications", +}; + +static int __init register_rtapp(void) +{ + return rv_register_monitor(&rv_rtapp, NULL); +} + +static void __exit unregister_rtapp(void) +{ + rv_unregister_monitor(&rv_rtapp); +} + +module_init(register_rtapp); +module_exit(unregister_rtapp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications"); diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h new file mode 100644 index 000000000000..4c200d67c7f6 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_rtapp; diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig index ae3eb410abd7..aa16456da864 100644 --- a/kernel/trace/rv/monitors/sched/Kconfig +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -2,6 +2,7 @@ # config RV_MON_SCHED depends on RV + depends on RV_PER_TASK_MONITORS >= 3 bool "sched monitor" help Collection of monitors to check the scheduler behaves according to specifications. diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index 905e03c3c934..d04db4b543f9 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -21,8 +21,7 @@ struct rv_monitor rv_sched = { static int __init register_sched(void) { - rv_register_monitor(&rv_sched, NULL); - return 0; + return rv_register_monitor(&rv_sched, NULL); } static void __exit unregister_sched(void) diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 4cff59220bfc..04c36405e2e3 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat da_handle_start_event_sco(sched_set_state_sco); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_sco(schedule_entry_sco); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_sco(schedule_exit_sco); } @@ -71,8 +71,7 @@ static struct rv_monitor rv_sco = { static int __init register_sco(void) { - rv_register_monitor(&rv_sco, &rv_sched); - return 0; + return rv_register_monitor(&rv_sco, &rv_sched); } static void __exit unregister_sco(void) diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig index b9114fbf680f..682d0416188b 100644 --- a/kernel/trace/rv/monitors/scpd/Kconfig +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SCPD depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index cbdd6a5f8d7f..1e351ba52fee 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_scpd(preempt_enable_scpd); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_scpd(schedule_entry_scpd); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_event_scpd(schedule_exit_scpd); } @@ -79,8 +79,7 @@ static struct rv_monitor rv_scpd = { static int __init register_scpd(void) { - rv_register_monitor(&rv_scpd, &rv_sched); - return 0; + return rv_register_monitor(&rv_scpd, &rv_sched); } static void __exit unregister_scpd(void) diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig new file mode 100644 index 000000000000..6b7a122e7b47 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SLEEP + depends on RV + select RV_LTL_MONITOR + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_RTAPP + select TRACE_IRQFLAGS + default y + select LTL_MON_EVENTS_ID + bool "sleep monitor" + help + Monitor that real-time tasks do not sleep in a manner that may + cause undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + Enabling this monitor may have performance impact (due to select + TRACE_IRQFLAGS). Therefore, you probably should say N for + production kernel. diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c new file mode 100644 index 000000000000..eea447b06907 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rv.h> +#include <linux/sched/deadline.h> +#include <linux/sched/rt.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "sleep" + +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/lock.h> +#include <uapi/linux/futex.h> +#include <rv_trace.h> +#include <monitors/rtapp/rtapp.h> + +#include "sleep.h" +#include <rv/ltl_monitor.h> + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. + */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + ltl_atom_set(mon, LTL_SLEEP, false); + ltl_atom_set(mon, LTL_WAKE, false); + ltl_atom_set(mon, LTL_ABORT_SLEEP, false); + ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false); + ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false); + ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false); + + if (task_creation) { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); + } + + if (task->flags & PF_KTHREAD) { + ltl_atom_set(mon, LTL_KERNEL_THREAD, true); + + /* kernel tasks do not do syscall */ + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + + if (strstarts(task->comm, "migration/")) + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); + else + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + + if (strstarts(task->comm, "rcu")) + ltl_atom_set(mon, LTL_TASK_IS_RCU, true); + else + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + } else { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_KERNEL_THREAD, false); + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + } + +} + +static void handle_sched_set_state(void *data, struct task_struct *task, int state) +{ + if (state & TASK_INTERRUPTIBLE) + ltl_atom_pulse(task, LTL_SLEEP, true); + else if (state == TASK_RUNNING) + ltl_atom_pulse(task, LTL_ABORT_SLEEP, true); +} + +static void handle_sched_wakeup(void *data, struct task_struct *task) +{ + ltl_atom_pulse(task, LTL_WAKE, true); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + if (this_cpu_read(hardirq_context)) { + ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true); + } else if (in_task()) { + if (current->prio <= task->prio) + ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true); + } else if (in_nmi()) { + ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true); + } +} + +static void handle_contention_begin(void *data, void *lock, unsigned int flags) +{ + if (flags & LCB_F_RT) + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true); +} + +static void handle_contention_end(void *data, void *lock, int ret) +{ + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false); +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct ltl_monitor *mon; + unsigned long args[6]; + int op, cmd; + + mon = ltl_get_monitor(current); + + switch (id) { + case __NR_clock_nanosleep: +#ifdef __NR_clock_nanosleep_time64 + case __NR_clock_nanosleep_time64: +#endif + syscall_get_arguments(current, regs, args); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); + break; + + case __NR_futex: +#ifdef __NR_futex_time64 + case __NR_futex_time64: +#endif + syscall_get_arguments(current, regs, args); + op = args[1]; + cmd = op & FUTEX_CMD_MASK; + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true); + break; + case FUTEX_WAIT: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + ltl_atom_update(current, LTL_FUTEX_WAIT, true); + break; + } + break; + } +} + +static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) +{ + struct ltl_monitor *mon = ltl_get_monitor(current); + + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); +} + +static void handle_kthread_stop(void *data, struct task_struct *task) +{ + /* FIXME: this could race with other tracepoint handlers */ + ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true); +} + +static int enable_sleep(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + return 0; +} + +static void disable_sleep(void) +{ + rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_sleep = { + .name = "sleep", + .description = "Monitor that RT tasks do not undesirably sleep", + .enable = enable_sleep, + .disable = disable_sleep, +}; + +static int __init register_sleep(void) +{ + return rv_register_monitor(&rv_sleep, &rv_rtapp); +} + +static void __exit unregister_sleep(void) +{ + rv_unregister_monitor(&rv_sleep); +} + +module_init(register_sleep); +module_exit(unregister_sleep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>"); +MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep"); diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h new file mode 100644 index 000000000000..2ab46fd218d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. + * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include <linux/rv.h> + +#define MONITOR_NAME sleep + +enum ltl_atom { + LTL_ABORT_SLEEP, + LTL_BLOCK_ON_RT_MUTEX, + LTL_CLOCK_NANOSLEEP, + LTL_FUTEX_LOCK_PI, + LTL_FUTEX_WAIT, + LTL_KERNEL_THREAD, + LTL_KTHREAD_SHOULD_STOP, + LTL_NANOSLEEP_CLOCK_MONOTONIC, + LTL_NANOSLEEP_CLOCK_TAI, + LTL_NANOSLEEP_TIMER_ABSTIME, + LTL_RT, + LTL_SLEEP, + LTL_TASK_IS_MIGRATION, + LTL_TASK_IS_RCU, + LTL_WAKE, + LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + LTL_WOKEN_BY_HARDIRQ, + LTL_WOKEN_BY_NMI, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "ab_sl", + "bl_on_rt_mu", + "cl_na", + "fu_lo_pi", + "fu_wa", + "ker_th", + "kth_sh_st", + "na_cl_mo", + "na_cl_ta", + "na_ti_ab", + "rt", + "sl", + "ta_mi", + "ta_rc", + "wak", + "wo_eq_hi_pr", + "wo_ha", + "wo_nm", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + if (val3) + __set_bit(S0, mon->states); + if (val11 && val13) + __set_bit(S1, mon->states); + if (val11 && val14) + __set_bit(S4, mon->states); + if (val5) + __set_bit(S5, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + switch (state) { + case S0: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S1: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S2: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S3: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S4: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S5: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S6: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S7: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h new file mode 100644 index 000000000000..22eaf31da987 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SLEEP +DEFINE_EVENT(event_ltl_monitor_id, event_sleep, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_sleep, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_SLEEP */ diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c deleted file mode 100644 index f5037cd6214c..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/ftrace.h> -#include <linux/tracepoint.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/rv.h> -#include <rv/instrumentation.h> -#include <rv/da_monitor.h> - -#define MODULE_NAME "sncid" - -#include <trace/events/sched.h> -#include <trace/events/preemptirq.h> -#include <rv_trace.h> -#include <monitors/sched/sched.h> - -#include "sncid.h" - -static struct rv_monitor rv_sncid; -DECLARE_DA_MON_PER_CPU(sncid, unsigned char); - -static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event_sncid(irq_disable_sncid); -} - -static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_start_event_sncid(irq_enable_sncid); -} - -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) -{ - da_handle_start_event_sncid(schedule_entry_sncid); -} - -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) -{ - da_handle_start_event_sncid(schedule_exit_sncid); -} - -static int enable_sncid(void) -{ - int retval; - - retval = da_monitor_init_sncid(); - if (retval) - return retval; - - rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_sncid(void) -{ - rv_sncid.enabled = 0; - - rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_sncid(); -} - -static struct rv_monitor rv_sncid = { - .name = "sncid", - .description = "schedule not called with interrupt disabled.", - .enable = enable_sncid, - .disable = disable_sncid, - .reset = da_monitor_reset_all_sncid, - .enabled = 0, -}; - -static int __init register_sncid(void) -{ - rv_register_monitor(&rv_sncid, &rv_sched); - return 0; -} - -static void __exit unregister_sncid(void) -{ - rv_unregister_monitor(&rv_sncid); -} - -module_init(register_sncid); -module_exit(unregister_sncid); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); -MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h deleted file mode 100644 index 21304725142b..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of sncid automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_sncid { - can_sched_sncid = 0, - cant_sched_sncid, - state_max_sncid -}; - -#define INVALID_STATE state_max_sncid - -enum events_sncid { - irq_disable_sncid = 0, - irq_enable_sncid, - schedule_entry_sncid, - schedule_exit_sncid, - event_max_sncid -}; - -struct automaton_sncid { - char *state_names[state_max_sncid]; - char *event_names[event_max_sncid]; - unsigned char function[state_max_sncid][event_max_sncid]; - unsigned char initial_state; - bool final_states[state_max_sncid]; -}; - -static const struct automaton_sncid automaton_sncid = { - .state_names = { - "can_sched", - "cant_sched" - }, - .event_names = { - "irq_disable", - "irq_enable", - "schedule_entry", - "schedule_exit" - }, - .function = { - { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, - { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, - }, - .initial_state = can_sched_sncid, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig index 77527f971232..7dd54f434ff7 100644 --- a/kernel/trace/rv/monitors/snep/Kconfig +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SNEP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 0076ba6d7ea4..558950f524a5 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_snep(preempt_enable_snep); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_snep(schedule_entry_snep); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_snep(schedule_exit_snep); } @@ -79,8 +79,7 @@ static struct rv_monitor rv_snep = { static int __init register_snep(void) { - rv_register_monitor(&rv_snep, &rv_sched); - return 0; + return rv_register_monitor(&rv_snep, &rv_sched); } static void __exit unregister_snep(void) diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 6d16b9ad931e..4cd9abb77b7b 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -41,8 +41,18 @@ static const struct automaton_snep automaton_snep = { "schedule_exit" }, .function = { - { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, - { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + { + non_scheduling_context_snep, + non_scheduling_context_snep, + scheduling_contex_snep, + INVALID_STATE + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + non_scheduling_context_snep + }, }, .initial_state = non_scheduling_context_snep, .final_states = { 1, 0 }, diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c index bb1f60d55296..540e686e699f 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.c +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -68,8 +68,7 @@ static struct rv_monitor rv_snroc = { static int __init register_snroc(void) { - rv_register_monitor(&rv_snroc, &rv_sched); - return 0; + return rv_register_monitor(&rv_snroc, &rv_sched); } static void __exit unregister_snroc(void) diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig index 76bcfef4fd10..23b7eeb38bbf 100644 --- a/kernel/trace/rv/monitors/sncid/Kconfig +++ b/kernel/trace/rv/monitors/sssw/Kconfig @@ -1,14 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only # -config RV_MON_SNCID +config RV_MON_SSSW depends on RV - depends on IRQSOFF_TRACER depends on RV_MON_SCHED default y - select DA_MON_EVENTS_IMPLICIT - bool "sncid monitor" + select DA_MON_EVENTS_ID + bool "sssw monitor" help - Monitor to ensure schedule is not called with interrupt disabled. + Monitor to ensure sched_set_state to sleepable leads to sleeping and + sleeping tasks require wakeup. This monitor is part of the sched monitors collection. For further information, see: diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c new file mode 100644 index 000000000000..84b8d890d9d4 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sssw" + +#include <trace/events/sched.h> +#include <trace/events/signal.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sssw.h" + +static struct rv_monitor rv_sssw; +DECLARE_DA_MON_PER_TASK(sssw, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + if (state == TASK_RUNNING) + da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw); + else + da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (preempt) + da_handle_event_sssw(prev, sched_switch_preempt_sssw); + else if (prev_state == TASK_RUNNING) + da_handle_event_sssw(prev, sched_switch_yield_sssw); + else if (prev_state == TASK_RTLOCK_WAIT) + /* special case of sleeping task with racy conditions */ + da_handle_event_sssw(prev, sched_switch_blocking_sssw); + else + da_handle_event_sssw(prev, sched_switch_suspend_sssw); + da_handle_event_sssw(next, sched_switch_in_sssw); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + /* + * Wakeup can also lead to signal_wakeup although the system is + * actually runnable. The monitor can safely start with this event. + */ + da_handle_start_event_sssw(p, sched_wakeup_sssw); +} + +static void handle_signal_deliver(void *data, int sig, + struct kernel_siginfo *info, + struct k_sigaction *ka) +{ + da_handle_event_sssw(current, signal_deliver_sssw); +} + +static int enable_sssw(void) +{ + int retval; + + retval = da_monitor_init_sssw(); + if (retval) + return retval; + + rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + return 0; +} + +static void disable_sssw(void) +{ + rv_sssw.enabled = 0; + + rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + da_monitor_destroy_sssw(); +} + +static struct rv_monitor rv_sssw = { + .name = "sssw", + .description = "set state sleep and wakeup.", + .enable = enable_sssw, + .disable = disable_sssw, + .reset = da_monitor_reset_all_sssw, + .enabled = 0, +}; + +static int __init register_sssw(void) +{ + return rv_register_monitor(&rv_sssw, &rv_sched); +} + +static void __exit unregister_sssw(void) +{ + rv_unregister_monitor(&rv_sssw); +} + +module_init(register_sssw); +module_exit(unregister_sssw); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sssw: set state sleep and wakeup."); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h new file mode 100644 index 000000000000..243d54050c94 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sssw automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sssw { + runnable_sssw = 0, + signal_wakeup_sssw, + sleepable_sssw, + sleeping_sssw, + state_max_sssw +}; + +#define INVALID_STATE state_max_sssw + +enum events_sssw { + sched_set_state_runnable_sssw = 0, + sched_set_state_sleepable_sssw, + sched_switch_blocking_sssw, + sched_switch_in_sssw, + sched_switch_preempt_sssw, + sched_switch_suspend_sssw, + sched_switch_yield_sssw, + sched_wakeup_sssw, + signal_deliver_sssw, + event_max_sssw +}; + +struct automaton_sssw { + char *state_names[state_max_sssw]; + char *event_names[event_max_sssw]; + unsigned char function[state_max_sssw][event_max_sssw]; + unsigned char initial_state; + bool final_states[state_max_sssw]; +}; + +static const struct automaton_sssw automaton_sssw = { + .state_names = { + "runnable", + "signal_wakeup", + "sleepable", + "sleeping" + }, + .event_names = { + "sched_set_state_runnable", + "sched_set_state_sleepable", + "sched_switch_blocking", + "sched_switch_in", + "sched_switch_preempt", + "sched_switch_suspend", + "sched_switch_yield", + "sched_wakeup", + "signal_deliver" + }, + .function = { + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + runnable_sssw, + runnable_sssw, + INVALID_STATE, + runnable_sssw, + runnable_sssw, + runnable_sssw + }, + { + INVALID_STATE, + sleepable_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + runnable_sssw + }, + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + sleepable_sssw, + sleepable_sssw, + sleeping_sssw, + signal_wakeup_sssw, + runnable_sssw, + sleepable_sssw + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + runnable_sssw, + INVALID_STATE + }, + }, + .initial_state = runnable_sssw, + .final_states = { 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h new file mode 100644 index 000000000000..6c03cfc6960b --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SSSW +DEFINE_EVENT(event_da_monitor_id, event_sssw, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_sssw, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SSSW */ diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig new file mode 100644 index 000000000000..7d1ff0f6fc91 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STS + depends on RV + depends on TRACE_IRQFLAGS + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sts monitor" + help + Monitor to ensure relationships between scheduler and task switches + * the scheduler is called and returns with interrupts disabled + * each call to the scheduler has up to one switch + * switches only happen inside the scheduler + * each call to the scheduler disables interrupts to switch + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c new file mode 100644 index 000000000000..c4a9cd67c1d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sts" + +#include <trace/events/sched.h> +#include <trace/events/irq.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sts.h" + +static struct rv_monitor rv_sts; +DECLARE_DA_MON_PER_CPU(sts, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_disable_sts); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_enable_sts); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_sts(sched_switch_sts); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event_sts(schedule_entry_sts); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event_sts(schedule_exit_sts); +} + +static int enable_sts(void) +{ + int retval; + + retval = da_monitor_init_sts(); + if (retval) + return retval; + + rv_attach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + attach_vector_irq(); + + return 0; +} + +static void disable_sts(void) +{ + rv_sts.enabled = 0; + + rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + detach_vector_irq(); + + da_monitor_destroy_sts(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_sts = { + .name = "sts", + .description = "schedule implies task switch.", + .enable = enable_sts, + .disable = disable_sts, + .reset = da_monitor_reset_all_sts, + .enabled = 0, +}; + +static int __init register_sts(void) +{ + return rv_register_monitor(&rv_sts, &rv_sched); +} + +static void __exit unregister_sts(void) +{ + rv_unregister_monitor(&rv_sts); +} + +module_init(register_sts); +module_exit(unregister_sts); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sts: schedule implies task switch."); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h new file mode 100644 index 000000000000..3368b6599a00 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sts automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sts { + can_sched_sts = 0, + cant_sched_sts, + disable_to_switch_sts, + enable_to_exit_sts, + in_irq_sts, + scheduling_sts, + switching_sts, + state_max_sts +}; + +#define INVALID_STATE state_max_sts + +enum events_sts { + irq_disable_sts = 0, + irq_enable_sts, + irq_entry_sts, + sched_switch_sts, + schedule_entry_sts, + schedule_exit_sts, + event_max_sts +}; + +struct automaton_sts { + char *state_names[state_max_sts]; + char *event_names[event_max_sts]; + unsigned char function[state_max_sts][event_max_sts]; + unsigned char initial_state; + bool final_states[state_max_sts]; +}; + +static const struct automaton_sts automaton_sts = { + .state_names = { + "can_sched", + "cant_sched", + "disable_to_switch", + "enable_to_exit", + "in_irq", + "scheduling", + "switching" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + scheduling_sts, + INVALID_STATE + }, + { + INVALID_STATE, + can_sched_sts, + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + in_irq_sts, + switching_sts, + INVALID_STATE, + INVALID_STATE + }, + { + enable_to_exit_sts, + enable_to_exit_sts, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + can_sched_sts + }, + { + INVALID_STATE, + scheduling_sts, + in_irq_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + disable_to_switch_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = can_sched_sts, + .final_states = { 1, 0, 0, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h index 4619dbb50cc0..d78beb58d5b3 100644 --- a/kernel/trace/rv/monitors/tss/tss_trace.h +++ b/kernel/trace/rv/monitors/sts/sts_trace.h @@ -4,12 +4,12 @@ * Snippet to be included in rv_trace.h */ -#ifdef CONFIG_RV_MON_TSS -DEFINE_EVENT(event_da_monitor, event_tss, +#ifdef CONFIG_RV_MON_STS +DEFINE_EVENT(event_da_monitor, event_sts, TP_PROTO(char *state, char *event, char *next_state, bool final_state), TP_ARGS(state, event, next_state, final_state)); -DEFINE_EVENT(error_da_monitor, error_tss, +DEFINE_EVENT(error_da_monitor, error_sts, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_TSS */ +#endif /* CONFIG_RV_MON_STS */ diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c deleted file mode 100644 index 542787e6524f..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/ftrace.h> -#include <linux/tracepoint.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/rv.h> -#include <rv/instrumentation.h> -#include <rv/da_monitor.h> - -#define MODULE_NAME "tss" - -#include <trace/events/sched.h> -#include <rv_trace.h> -#include <monitors/sched/sched.h> - -#include "tss.h" - -static struct rv_monitor rv_tss; -DECLARE_DA_MON_PER_CPU(tss, unsigned char); - -static void handle_sched_switch(void *data, bool preempt, - struct task_struct *prev, - struct task_struct *next, - unsigned int prev_state) -{ - da_handle_event_tss(sched_switch_tss); -} - -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) -{ - da_handle_event_tss(schedule_entry_tss); -} - -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) -{ - da_handle_start_event_tss(schedule_exit_tss); -} - -static int enable_tss(void) -{ - int retval; - - retval = da_monitor_init_tss(); - if (retval) - return retval; - - rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_tss(void) -{ - rv_tss.enabled = 0; - - rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_tss(); -} - -static struct rv_monitor rv_tss = { - .name = "tss", - .description = "task switch while scheduling.", - .enable = enable_tss, - .disable = disable_tss, - .reset = da_monitor_reset_all_tss, - .enabled = 0, -}; - -static int __init register_tss(void) -{ - rv_register_monitor(&rv_tss, &rv_sched); - return 0; -} - -static void __exit unregister_tss(void) -{ - rv_unregister_monitor(&rv_tss); -} - -module_init(register_tss); -module_exit(unregister_tss); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); -MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h deleted file mode 100644 index f0a36fda1b87..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of tss automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_tss { - thread_tss = 0, - sched_tss, - state_max_tss -}; - -#define INVALID_STATE state_max_tss - -enum events_tss { - sched_switch_tss = 0, - schedule_entry_tss, - schedule_exit_tss, - event_max_tss -}; - -struct automaton_tss { - char *state_names[state_max_tss]; - char *event_names[event_max_tss]; - unsigned char function[state_max_tss][event_max_tss]; - unsigned char initial_state; - bool final_states[state_max_tss]; -}; - -static const struct automaton_tss automaton_tss = { - .state_names = { - "thread", - "sched" - }, - .event_names = { - "sched_switch", - "schedule_entry", - "schedule_exit" - }, - .function = { - { INVALID_STATE, sched_tss, INVALID_STATE }, - { sched_tss, INVALID_STATE, thread_tss }, - }, - .initial_state = thread_tss, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index e464b9294865..87a26195792b 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_WIP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE select DA_MON_EVENTS_IMPLICIT bool "wip monitor" help diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index ed758fec8608..4b4e99615a11 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,8 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip, NULL); - return 0; + return rv_register_monitor(&rv_wip, NULL); } static void __exit unregister_wip(void) diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 172f31c4b0f3..4145bea2729e 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,8 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr, NULL); - return 0; + return rv_register_monitor(&rv_wwnr, NULL); } static void __exit unregister_wwnr(void) diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 0186ff4cbd0b..74c6bcc2c749 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,9 +13,13 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_panic_reaction(char *msg) +__printf(1, 2) static void rv_panic_reaction(const char *msg, ...) { - panic(msg); + va_list args; + + va_start(args, msg); + vpanic(msg, args); + va_end(args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 178759dbf89f..2dae2916c05f 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,9 +12,13 @@ #include <linux/init.h> #include <linux/rv.h> -static void rv_printk_reaction(char *msg) +__printf(1, 2) static void rv_printk_reaction(const char *msg, ...) { - printk_deferred(msg); + va_list args; + + va_start(args, msg); + vprintk_deferred(msg, args); + va_end(args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index e4077500a91d..bd7d56dbf6c2 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -143,7 +143,7 @@ #include <linux/init.h> #include <linux/slab.h> -#ifdef CONFIG_DA_MON_EVENTS +#ifdef CONFIG_RV_MON_EVENTS #define CREATE_TRACE_POINTS #include <rv_trace.h> #endif @@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void) LIST_HEAD(rv_monitors_list); static int task_monitor_count; -static bool task_monitor_slots[RV_PER_TASK_MONITORS]; +static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS]; int rv_get_task_monitor_slot(void) { @@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void) lockdep_assert_held(&rv_interface_lock); - if (task_monitor_count == RV_PER_TASK_MONITORS) + if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS) return -EBUSY; task_monitor_count++; - for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) { if (task_monitor_slots[i] == false) { task_monitor_slots[i] = true; return i; @@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot) { lockdep_assert_held(&rv_interface_lock); - if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) { WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); return; } @@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot) * Monitors with a parent are nested, * Monitors without a parent could be standalone or containers. */ -bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +bool rv_is_nested_monitor(struct rv_monitor *mon) { - return mdef->parent != NULL; + return mon->parent != NULL; } /* @@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef) * for enable()/disable(). Use this condition to find empty containers. * Keep both conditions in case we have some non-compliant containers. */ -bool rv_is_container_monitor(struct rv_monitor_def *mdef) +bool rv_is_container_monitor(struct rv_monitor *mon) { - struct rv_monitor_def *next; + struct rv_monitor *next; - if (list_is_last(&mdef->list, &rv_monitors_list)) + if (list_is_last(&mon->list, &rv_monitors_list)) return false; - next = list_next_entry(mdef, list); + next = list_next_entry(mon, list); - return next->parent == mdef->monitor || !mdef->monitor->enable; + return next->parent == mon || !mon->enable; } /* @@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; const char *buff; - buff = mdef->monitor->enabled ? "1\n" : "0\n"; + buff = mon->enabled ? "1\n" : "0\n"; return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); } @@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf /* * __rv_disable_monitor - disabled an enabled monitor */ -static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +static int __rv_disable_monitor(struct rv_monitor *mon, bool sync) { lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) { - mdef->monitor->enabled = 0; - if (mdef->monitor->disable) - mdef->monitor->disable(); + if (mon->enabled) { + mon->enabled = 0; + if (mon->disable) + mon->disable(); /* * Wait for the execution of all events to finish. @@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } -static void rv_disable_single(struct rv_monitor_def *mdef) +static void rv_disable_single(struct rv_monitor *mon) { - __rv_disable_monitor(mdef, true); + __rv_disable_monitor(mon, true); } -static int rv_enable_single(struct rv_monitor_def *mdef) +static int rv_enable_single(struct rv_monitor *mon) { int retval; lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) + if (mon->enabled) return 0; - retval = mdef->monitor->enable(); + retval = mon->enable(); if (!retval) - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } -static void rv_disable_container(struct rv_monitor_def *mdef) +static void rv_disable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int enabled = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; enabled += __rv_disable_monitor(p, false); } if (enabled) tracepoint_synchronize_unregister(); - mdef->monitor->enabled = 0; + mon->enabled = 0; } -static int rv_enable_container(struct rv_monitor_def *mdef) +static int rv_enable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int retval = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (retval || p->parent != mdef->monitor) + if (retval || p->parent != mon) break; retval = rv_enable_single(p); } if (retval) - rv_disable_container(mdef); + rv_disable_container(mon); else - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } /** * rv_disable_monitor - disable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success. */ -int rv_disable_monitor(struct rv_monitor_def *mdef) +int rv_disable_monitor(struct rv_monitor *mon) { - if (rv_is_container_monitor(mdef)) - rv_disable_container(mdef); + if (rv_is_container_monitor(mon)) + rv_disable_container(mon); else - rv_disable_single(mdef); + rv_disable_single(mon); return 0; } /** * rv_enable_monitor - enable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. */ -int rv_enable_monitor(struct rv_monitor_def *mdef) +int rv_enable_monitor(struct rv_monitor *mon) { int retval; - if (rv_is_container_monitor(mdef)) - retval = rv_enable_container(mdef); + if (rv_is_container_monitor(mon)) + retval = rv_enable_container(mon); else - retval = rv_enable_single(mdef); + retval = rv_enable_single(mon); return retval; } @@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; int retval; bool val; @@ -378,9 +378,9 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u mutex_lock(&rv_interface_lock); if (val) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); + retval = rv_disable_monitor(mon); mutex_unlock(&rv_interface_lock); @@ -399,12 +399,12 @@ static const struct file_operations interface_enable_fops = { static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; char buff[256]; memset(buff, 0, sizeof(buff)); - snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + snprintf(buff, sizeof(buff), "%s\n", mon->description); return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); } @@ -419,37 +419,37 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) +static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent) { struct dentry *root = parent ? parent->root_d : get_monitors_root(); - const char *name = mdef->monitor->name; + const char *name = mon->name; struct dentry *tmp; int retval; - mdef->root_d = rv_create_dir(name, root); - if (!mdef->root_d) + mon->root_d = rv_create_dir(name, root); + if (!mon->root_d) return -ENOMEM; - tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops); if (!tmp) { retval = -ENOMEM; goto out_remove_root; } - tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops); if (!tmp) { retval = -ENOMEM; goto out_remove_root; } - retval = reactor_populate_monitor(mdef); + retval = reactor_populate_monitor(mon); if (retval) goto out_remove_root; return 0; out_remove_root: - rv_remove(mdef->root_d); + rv_remove(mon->root_d); return retval; } @@ -458,13 +458,12 @@ out_remove_root: */ static int monitors_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mon_def = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); - if (mon_def->parent) - seq_printf(m, "%s:%s\n", mon_def->parent->name, - mon_def->monitor->name); + if (mon->parent) + seq_printf(m, "%s:%s\n", mon->parent->name, mon->name); else - seq_printf(m, "%s\n", mon_def->monitor->name); + seq_printf(m, "%s\n", mon->name); return 0; } @@ -496,13 +495,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor_def *m_def = p; + struct rv_monitor *mon = p; (*pos)++; - list_for_each_entry_continue(m_def, &rv_monitors_list, list) { - if (m_def->monitor->enabled) - return m_def; + list_for_each_entry_continue(mon, &rv_monitors_list, list) { + if (mon->enabled) + return mon; } return NULL; @@ -510,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor_def *m_def; + struct rv_monitor *mon; loff_t l; mutex_lock(&rv_interface_lock); @@ -518,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + mon = list_entry(&rv_monitors_list, struct rv_monitor, list); for (l = 0; l <= *pos; ) { - m_def = enabled_monitors_next(m, m_def, &l); - if (!m_def) + mon = enabled_monitors_next(m, mon, &l); + if (!mon) break; } - return m_def; + return mon; } /* @@ -566,13 +565,13 @@ static const struct file_operations available_monitors_ops = { */ static void disable_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int enabled = 0; mutex_lock(&rv_interface_lock); - list_for_each_entry(mdef, &rv_monitors_list, list) - enabled += __rv_disable_monitor(mdef, false); + list_for_each_entry(mon, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mon, false); if (enabled) { /* @@ -598,7 +597,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user size_t count, loff_t *ppos) { char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int retval = -EINVAL; bool enable = true; char *ptr, *tmp; @@ -633,17 +632,17 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (tmp) ptr = tmp+1; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (strcmp(ptr, mdef->monitor->name) != 0) + list_for_each_entry(mon, &rv_monitors_list, list) { + if (strcmp(ptr, mon->name) != 0) continue; /* * Monitor found! */ if (enable) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); + retval = rv_disable_monitor(mon); if (!retval) retval = count; @@ -702,11 +701,11 @@ static void turn_monitoring_off(void) static void reset_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled && mdef->monitor->reset) - mdef->monitor->reset(); + list_for_each_entry(mon, &rv_monitors_list, list) { + if (mon->enabled && mon->reset) + mon->reset(); } } @@ -768,10 +767,9 @@ static const struct file_operations monitoring_on_fops = { .read = monitoring_on_read_data, }; -static void destroy_monitor_dir(struct rv_monitor_def *mdef) +static void destroy_monitor_dir(struct rv_monitor *mon) { - reactor_cleanup_monitor(mdef); - rv_remove(mdef->root_d); + rv_remove(mon->root_d); } /** @@ -783,7 +781,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) */ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r, *p = NULL; + struct rv_monitor *r; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { @@ -795,49 +793,31 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) mutex_lock(&rv_interface_lock); list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(monitor->name, r->monitor->name) == 0) { + if (strcmp(monitor->name, r->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); retval = -EEXIST; goto out_unlock; } } - if (parent) { - list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(parent->name, r->monitor->name) == 0) { - p = r; - break; - } - } - } - - if (p && rv_is_nested_monitor(p)) { + if (parent && rv_is_nested_monitor(parent)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); retval = -EINVAL; goto out_unlock; } - r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); - if (!r) { - retval = -ENOMEM; - goto out_unlock; - } - - r->monitor = monitor; - r->parent = parent; + monitor->parent = parent; - retval = create_monitor_dir(r, p); - if (retval) { - kfree(r); - goto out_unlock; - } + retval = create_monitor_dir(monitor, parent); + if (retval) + return retval; /* keep children close to the parent for easier visualisation */ - if (p) - list_add(&r->list, &p->list); + if (parent) + list_add(&monitor->list, &parent->list); else - list_add_tail(&r->list, &rv_monitors_list); + list_add_tail(&monitor->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); @@ -852,17 +832,11 @@ out_unlock: */ int rv_unregister_monitor(struct rv_monitor *monitor) { - struct rv_monitor_def *ptr, *next; - mutex_lock(&rv_interface_lock); - list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { - if (strcmp(monitor->name, ptr->monitor->name) == 0) { - rv_disable_monitor(ptr); - list_del(&ptr->list); - destroy_monitor_dir(ptr); - } - } + rv_disable_monitor(monitor); + list_del(&monitor->list); + destroy_monitor_dir(monitor); mutex_unlock(&rv_interface_lock); return 0; diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 98fca0a1adbc..1485a70c1bf4 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -23,48 +23,21 @@ struct rv_interface { extern struct mutex rv_interface_lock; extern struct list_head rv_monitors_list; -#ifdef CONFIG_RV_REACTORS -struct rv_reactor_def { - struct list_head list; - struct rv_reactor *reactor; - /* protected by the monitor interface lock */ - int counter; -}; -#endif - -struct rv_monitor_def { - struct list_head list; - struct rv_monitor *monitor; - struct rv_monitor *parent; - struct dentry *root_d; -#ifdef CONFIG_RV_REACTORS - struct rv_reactor_def *rdef; - bool reacting; -#endif - bool task_monitor; -}; - struct dentry *get_monitors_root(void); -int rv_disable_monitor(struct rv_monitor_def *mdef); -int rv_enable_monitor(struct rv_monitor_def *mdef); -bool rv_is_container_monitor(struct rv_monitor_def *mdef); -bool rv_is_nested_monitor(struct rv_monitor_def *mdef); +int rv_disable_monitor(struct rv_monitor *mon); +int rv_enable_monitor(struct rv_monitor *mon); +bool rv_is_container_monitor(struct rv_monitor *mon); +bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS -int reactor_populate_monitor(struct rv_monitor_def *mdef); -void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int reactor_populate_monitor(struct rv_monitor *mon); int init_rv_reactors(struct dentry *root_dir); #else -static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +static inline int reactor_populate_monitor(struct rv_monitor *mon) { return 0; } -static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - return; -} - static inline int init_rv_reactors(struct dentry *root_dir) { return 0; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 9501ca886d83..d32859fec238 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -70,12 +70,12 @@ */ static LIST_HEAD(rv_reactors_list); -static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +static struct rv_reactor *get_reactor_rdef_by_name(char *name) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(name, r->reactor->name) == 0) + if (strcmp(name, r->name) == 0) return r; } return NULL; @@ -86,9 +86,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) */ static int reactors_show(struct seq_file *m, void *p) { - struct rv_reactor_def *rea_def = p; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - seq_printf(m, "%s\n", rea_def->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -138,13 +138,13 @@ static const struct file_operations available_reactors_ops = { */ static int monitor_reactor_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mdef = m->private; - struct rv_reactor_def *rdef = p; + struct rv_monitor *mon = m->private; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); - if (mdef->rdef == rdef) - seq_printf(m, "[%s]\n", rdef->reactor->name); + if (mon->reactor == reactor) + seq_printf(m, "[%s]\n", reactor->name); else - seq_printf(m, "%s\n", rdef->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -158,43 +158,37 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, - struct rv_reactor_def *rdef, - bool reacting, bool nested) +static void monitor_swap_reactors_single(struct rv_monitor *mon, + struct rv_reactor *reactor, + bool nested) { bool monitor_enabled; /* nothing to do */ - if (mdef->rdef == rdef) + if (mon->reactor == reactor) return; - monitor_enabled = mdef->monitor->enabled; + monitor_enabled = mon->enabled; if (monitor_enabled) - rv_disable_monitor(mdef); + rv_disable_monitor(mon); - /* swap reactor's usage */ - mdef->rdef->counter--; - rdef->counter++; - - mdef->rdef = rdef; - mdef->reacting = reacting; - mdef->monitor->react = rdef->reactor->react; + mon->reactor = reactor; + mon->react = reactor->react; /* enable only once if iterating through a container */ if (monitor_enabled && !nested) - rv_enable_monitor(mdef); + rv_enable_monitor(mon); } -static void monitor_swap_reactors(struct rv_monitor_def *mdef, - struct rv_reactor_def *rdef, bool reacting) +static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; - if (rv_is_container_monitor(mdef)) + if (rv_is_container_monitor(mon)) list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; - monitor_swap_reactors_single(p, rdef, reacting, true); + monitor_swap_reactors_single(p, reactor, true); } /* * This call enables and disables the monitor if they were active. @@ -202,7 +196,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, * All nested monitors are enabled also if they were off, we may refine * this logic in the future. */ - monitor_swap_reactors_single(mdef, rdef, reacting, false); + monitor_swap_reactors_single(mon, reactor, false); } static ssize_t @@ -210,11 +204,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; - struct rv_reactor_def *rdef; + struct rv_monitor *mon; + struct rv_reactor *reactor; struct seq_file *seq_f; int retval = -EINVAL; - bool enable; char *ptr; int len; @@ -237,22 +230,17 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, * See monitor_reactors_open() */ seq_f = file->private_data; - mdef = seq_f->private; + mon = seq_f->private; mutex_lock(&rv_interface_lock); retval = -EINVAL; - list_for_each_entry(rdef, &rv_reactors_list, list) { - if (strcmp(ptr, rdef->reactor->name) != 0) + list_for_each_entry(reactor, &rv_reactors_list, list) { + if (strcmp(ptr, reactor->name) != 0) continue; - if (rdef == get_reactor_rdef_by_name("nop")) - enable = false; - else - enable = true; - - monitor_swap_reactors(mdef, rdef, enable); + monitor_swap_reactors(mon, reactor); retval = count; break; @@ -268,7 +256,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, */ static int monitor_reactors_open(struct inode *inode, struct file *file) { - struct rv_monitor_def *mdef = inode->i_private; + struct rv_monitor *mon = inode->i_private; struct seq_file *seq_f; int ret; @@ -284,7 +272,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file) /* * Copy the create file "private" data to the seq_file private data. */ - seq_f->private = mdef; + seq_f->private = mon; return 0; }; @@ -299,23 +287,16 @@ static const struct file_operations monitor_reactors_ops = { static int __rv_register_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(reactor->name, r->reactor->name) == 0) { + if (strcmp(reactor->name, r->name) == 0) { pr_info("Reactor %s is already registered\n", reactor->name); return -EINVAL; } } - r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->reactor = reactor; - r->counter = 0; - - list_add_tail(&r->list, &rv_reactors_list); + list_add_tail(&reactor->list, &rv_reactors_list); return 0; } @@ -350,30 +331,10 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *ptr, *next; - int ret = 0; - mutex_lock(&rv_interface_lock); - - list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { - if (strcmp(reactor->name, ptr->reactor->name) == 0) { - - if (!ptr->counter) { - list_del(&ptr->list); - } else { - printk(KERN_WARNING - "rv: the rv_reactor %s is in use by %d monitor(s)\n", - ptr->reactor->name, ptr->counter); - printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", - ptr->reactor->name); - ret = -EBUSY; - break; - } - } - } - + list_del(&reactor->list); mutex_unlock(&rv_interface_lock); - return ret; + return 0; } /* @@ -454,43 +415,30 @@ static const struct file_operations reacting_on_fops = { /** * reactor_populate_monitor - creates per monitor reactors file - * @mdef: monitor's definition. + * @mon: The monitor. * * Returns 0 if successful, error otherwise. */ -int reactor_populate_monitor(struct rv_monitor_def *mdef) +int reactor_populate_monitor(struct rv_monitor *mon) { struct dentry *tmp; - tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops); if (!tmp) return -ENOMEM; /* * Configure as the rv_nop reactor. */ - mdef->rdef = get_reactor_rdef_by_name("nop"); - mdef->rdef->counter++; - mdef->reacting = false; + mon->reactor = get_reactor_rdef_by_name("nop"); return 0; } -/** - * reactor_cleanup_monitor - cleanup a monitor reference - * @mdef: monitor's definition. - */ -void reactor_cleanup_monitor(struct rv_monitor_def *mdef) -{ - lockdep_assert_held(&rv_interface_lock); - mdef->rdef->counter--; - WARN_ON_ONCE(mdef->rdef->counter < 0); -} - /* * Nop reactor register */ -static void rv_nop_reaction(char *msg) +__printf(1, 2) static void rv_nop_reaction(const char *msg, ...) { } diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 422b75f58891..4a6faddac614 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -16,24 +16,24 @@ DECLARE_EVENT_CLASS(event_da_monitor, TP_ARGS(state, event, next_state, final_state), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->final_state = final_state; ), - TP_printk("%s x %s -> %s %s", - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? "(final)" : "") + TP_printk("%s x %s -> %s%s", + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor, @@ -43,26 +43,26 @@ DECLARE_EVENT_CLASS(error_da_monitor, TP_ARGS(state, event), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __assign_str(state); + __assign_str(event); ), TP_printk("event %s not expected in the state %s", - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include <monitors/wip/wip_trace.h> -#include <monitors/tss/tss_trace.h> #include <monitors/sco/sco_trace.h> #include <monitors/scpd/scpd_trace.h> #include <monitors/snep/snep_trace.h> -#include <monitors/sncid/sncid_trace.h> +#include <monitors/sts/sts_trace.h> +#include <monitors/opid/opid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -75,27 +75,27 @@ DECLARE_EVENT_CLASS(event_da_monitor_id, TP_ARGS(id, state, event, next_state, final_state), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->id = id; - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->id = id; + __entry->final_state = final_state; ), - TP_printk("%d: %s x %s -> %s %s", + TP_printk("%d: %s x %s -> %s%s", __entry->id, - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? "(final)" : "") + __get_str(state), + __get_str(event), + __get_str(next_state), + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor_id, @@ -105,32 +105,108 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, TP_ARGS(id, state, event), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __field( int, id ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - __entry->id = id; + __assign_str(state); + __assign_str(event); + __entry->id = id; ), TP_printk("%d: event %s not expected in the state %s", __entry->id, - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include <monitors/wwnr/wwnr_trace.h> #include <monitors/snroc/snroc_trace.h> +#include <monitors/nrp/nrp_trace.h> +#include <monitors/sssw/sssw_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ +#ifdef CONFIG_LTL_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_ltl_monitor_id, + + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + + TP_ARGS(task, states, atoms, next), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + __string(states, states) + __string(atoms, atoms) + __string(next, next) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + __assign_str(states); + __assign_str(atoms); + __assign_str(next); + ), + + TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid, + __get_str(states), __get_str(atoms), __get_str(next)) +); + +DECLARE_EVENT_CLASS(error_ltl_monitor_id, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + ), + + TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) +); +#include <monitors/pagefault/pagefault_trace.h> +#include <monitors/sleep/sleep_trace.h> +// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here +#endif /* CONFIG_LTL_MON_EVENTS_ID */ + +#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS +/* Tracepoint useful for monitors development, currenly only used in DA */ +TRACE_EVENT(rv_retries_error, + + TP_PROTO(char *name, char *event), + + TP_ARGS(name, event), + + TP_STRUCT__entry( + __string( name, name ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(name); + __assign_str(event); + ), + + TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS) + " retries reached for event %s, resetting monitor %s", + __get_str(event), __get_str(name)) +); +#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */ #endif /* _TRACE_RV_H */ -/* This part ust be outside protection */ +/* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE rv_trace #include <trace/define_trace.h> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b8db27fb6ef..7996f26c3f46 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -51,6 +51,7 @@ #include <linux/workqueue.h> #include <linux/sort.h> #include <linux/io.h> /* vmap_page_range() */ +#include <linux/fs_context.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -120,6 +121,7 @@ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * @@ -142,7 +144,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask; char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ @@ -493,7 +528,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) + TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \ + TRACE_ITER_COPY_MARKER) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ @@ -501,7 +537,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \ + TRACE_ITER_COPY_MARKER) /* * The global_trace is the descriptor that holds the top-level tracing @@ -513,6 +550,9 @@ static struct trace_array global_trace = { static struct trace_array *printk_trace = &global_trace; +/* List of trace_arrays interested in the top level trace_marker */ +static LIST_HEAD(marker_copies); + static __always_inline bool printk_binsafe(struct trace_array *tr) { /* @@ -534,6 +574,28 @@ static void update_printk_trace(struct trace_array *tr) tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; } +/* Returns true if the status of tr changed */ +static bool update_marker_trace(struct trace_array *tr, int enabled) +{ + lockdep_assert_held(&event_mutex); + + if (enabled) { + if (!list_empty(&tr->marker_list)) + return false; + + list_add_rcu(&tr->marker_list, &marker_copies); + tr->trace_flags |= TRACE_ITER_COPY_MARKER; + return true; + } + + if (list_empty(&tr->marker_list)) + return false; + + list_del_init(&tr->marker_list); + tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; + return true; +} + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -1583,6 +1645,39 @@ void tracer_tracing_off(struct trace_array *tr) } /** + * tracer_tracing_disable() - temporary disable the buffer from write + * @tr: The trace array to disable its buffer for + * + * Expects trace_tracing_enable() to re-enable tracing. + * The difference between this and tracer_tracing_off() is that this + * is a counter and can nest, whereas, tracer_tracing_off() can + * be called multiple times and a single trace_tracing_on() will + * enable it. + */ +void tracer_tracing_disable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_disable(tr->array_buffer.buffer); +} + +/** + * tracer_tracing_enable() - counter part of tracer_tracing_disable() + * @tr: The trace array that had tracer_tracincg_disable() called on it + * + * This is called after tracer_tracing_disable() has been called on @tr, + * when it's safe to re-enable tracing. + */ +void tracer_tracing_enable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_enable(tr->array_buffer.buffer); +} + +/** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. @@ -4640,21 +4735,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } @@ -5048,7 +5137,6 @@ int tracing_set_cpumask(struct trace_array *tr, */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); @@ -5056,7 +5144,6 @@ int tracing_set_cpumask(struct trace_array *tr, } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); @@ -5189,7 +5276,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK)) + (mask == TRACE_ITER_TRACE_PRINTK) || + (mask == TRACE_ITER_COPY_MARKER)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ @@ -5220,6 +5308,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) } } + if (mask == TRACE_ITER_COPY_MARKER) + update_marker_trace(tr, enabled); + if (enabled) tr->trace_flags |= mask; else @@ -6004,6 +6095,7 @@ struct trace_mod_entry { }; struct trace_scratch { + unsigned int clock_id; unsigned long text_addr; unsigned long nr_entries; struct trace_mod_entry entries[]; @@ -6032,6 +6124,7 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) struct trace_module_delta *module_delta; struct trace_scratch *tscratch; struct trace_mod_entry *entry; + unsigned long raddr; int idx = 0, nr_entries; /* If we don't have last boot delta, return the address */ @@ -6045,7 +6138,9 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) module_delta = READ_ONCE(tr->module_delta); if (!module_delta || !tscratch->nr_entries || tscratch->entries[0].mod_addr > addr) { - return addr + tr->text_delta; + raddr = addr + tr->text_delta; + return __is_kernel(raddr) || is_kernel_core_data(raddr) || + is_kernel_rodata(raddr) ? raddr : addr; } /* Note that entries must be sorted. */ @@ -6116,6 +6211,7 @@ static void update_last_data(struct trace_array *tr) if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; + tscratch->clock_id = tr->clock_id; memset(tscratch->entries, 0, flex_array_size(tscratch, entries, tscratch->nr_entries)); tscratch->nr_entries = 0; @@ -6610,6 +6706,22 @@ static int tracing_wait_pipe(struct file *filp) return 1; } +static bool update_last_data_if_empty(struct trace_array *tr) +{ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return false; + + if (!ring_buffer_empty(tr->array_buffer.buffer)) + return false; + + /* + * If the buffer contains the last boot data and all per-cpu + * buffers are empty, reset it from the kernel side. + */ + update_last_data(tr); + return true; +} + /* * Consumer reader. */ @@ -6641,6 +6753,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } waitagain: + if (update_last_data_if_empty(iter->tr)) + return 0; + sret = tracing_wait_pipe(filp); if (sret <= 0) return sret; @@ -6824,7 +6939,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), min((size_t)trace_seq_used(&iter->seq), - PAGE_SIZE)); + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; @@ -7100,11 +7215,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) #define TRACE_MARKER_MAX_SIZE 4096 -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf, + size_t cnt, unsigned long ip) { - struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; @@ -7118,18 +7231,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, #define FAULTED_STR "<faulted>" #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if ((ssize_t)cnt < 0) - return -EINVAL; - - if (cnt > TRACE_MARKER_MAX_SIZE) - cnt = TRACE_MARKER_MAX_SIZE; - meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; @@ -7162,7 +7263,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; + entry->ip = ip; len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { @@ -7195,18 +7296,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_mark_raw_write(struct file *filp, const char __user *ubuf, +tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct raw_data_entry *entry; - ssize_t written; - int size; - int len; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + ssize_t written = -ENODEV; + unsigned long ip; if (tracing_disabled) return -EINVAL; @@ -7214,10 +7309,42 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; - /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int)) + if ((ssize_t)cnt < 0) return -EINVAL; + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + + /* The selftests expect this function to be the IP address */ + ip = _THIS_IP_; + + /* The global trace_marker can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_marker_to_buffer(tr, ubuf, cnt, ip); + if (written < 0) + break; + } + } else { + written = write_marker_to_buffer(tr, ubuf, cnt, ip); + } + + return written; +} + +static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, + const char __user *ubuf, size_t cnt) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + int size; + int len; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; @@ -7248,6 +7375,40 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return written; } +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int)) + return -EINVAL; + + /* The global trace_marker_raw can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_raw_marker_to_buffer(tr, ubuf, cnt); + if (written < 0) + break; + } + } else { + written = write_raw_marker_to_buffer(tr, ubuf, cnt); + } + + return written; +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -7292,6 +7453,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->max_buffer); #endif + if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = i; + } + mutex_unlock(&trace_types_lock); return 0; @@ -8167,6 +8334,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter) && !iter->closed) { + if (update_last_data_if_empty(iter->tr)) + return 0; + if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; @@ -8508,10 +8678,6 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) return -ENODEV; - /* Currently the boot mapped buffer is not supported for mmap */ - if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) - return -ENODEV; - ret = get_snapshot_map(iter->tr); if (ret) return ret; @@ -9517,6 +9683,15 @@ static void setup_trace_scratch(struct trace_array *tr, /* Scan modules to make text delta for modules. */ module_for_each_mod(make_mod_delta, tr); + + /* Set trace_clock as the same of the previous boot. */ + if (tscratch->clock_id != tr->clock_id) { + if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || + tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { + pr_info("the previous trace_clock info is not valid."); + goto reset; + } + } return; reset: /* Invalid trace modules */ @@ -9741,6 +9916,7 @@ trace_array_create_systems(const char *name, const char *systems, INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&tr->mod_events); @@ -9900,6 +10076,9 @@ static int __remove_instance(struct trace_array *tr) if (printk_trace == tr) update_printk_trace(&global_trace); + if (update_marker_trace(tr, 0)) + synchronize_rcu(); + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -10075,6 +10254,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; + struct fs_context *fc; + int ret; /* * To maintain backward compatibility for tools that mount @@ -10084,12 +10265,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_submount(mntpt, type, "tracefs", NULL); + + fc = fs_context_for_submount(type, mntpt); put_filesystem(type); - if (IS_ERR(mnt)) - return NULL; - mntget(mnt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = vfs_parse_fs_string(fc, "source", + "tracefs", strlen("tracefs")); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + put_fs_context(fc); return mnt; } @@ -10448,7 +10637,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; - int cnt = 0, cpu; + int cnt = 0; /* * Always turn off tracing when we dump. @@ -10465,9 +10654,8 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* Simulate the iterator */ trace_init_iter(&iter, tr); - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + /* While dumping, do not allow the buffer to be enable */ + tracer_tracing_disable(tr); old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -10526,9 +10714,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m tr->trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(tr); local_irq_restore(flags); } @@ -10968,6 +11154,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79be1995db44..bd084953a98b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -183,8 +183,7 @@ struct trace_array; * the trace, etc.) */ struct trace_array_cpu { - atomic_t disabled; - void *buffer_page; /* ring buffer spare */ + local_t disabled; unsigned long entries; unsigned long saved_latency; @@ -404,6 +403,7 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct list_head marker_list; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ @@ -665,12 +665,29 @@ bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); +void tracer_tracing_disable(struct trace_array *tr); +void tracer_tracing_enable(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); + +/** + * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu + * @tr : the trace array to know if ring buffer is enabled + * @cpu: The cpu buffer to check if enabled + * + * Shows real state of the per CPU buffer if it is enabled or not. + */ +static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu) +{ + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu); + return false; +} + int tracing_init_dentry(void); struct ring_buffer_event; @@ -1368,6 +1385,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ + C(COPY_MARKER, "copy_trace_marker"),\ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ @@ -1772,6 +1790,9 @@ extern int event_enable_register_trigger(char *glob, extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); +extern struct event_trigger_data * +trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, + void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, @@ -1798,11 +1819,6 @@ extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); -extern struct event_trigger_data * -event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6d08a5523ce0..6809b370e991 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; - struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; @@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) raw_local_irq_save(flags); current->trace_recursion |= TRACE_BRANCH_BIT; - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id())) goto out; trace_ctx = tracing_gen_ctx_flags(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4ef4df6623a8..de294ae2c5c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -97,11 +97,11 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, F_STRUCT( __field_struct( struct fgraph_retaddr_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __field_packed( unsigned long, graph_ent, retaddr ) ), - F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth, (void *)__entry->retaddr) ); @@ -124,13 +124,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) __field_packed( unsigned long, ret, retval ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth, __entry->retval) @@ -146,13 +146,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 916555f0de81..a1d402124836 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -9,14 +9,15 @@ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> * */ +#include <linux/cleanup.h> +#include <linux/ftrace.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/ftrace.h> #include "trace_dynevent.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec) val = *(unsigned int *)addr; break; default: - if (field->is_signed) - val = *(long *)addr; - else - val = *(unsigned long *)addr; + if (field->size == sizeof(long)) { + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + /* This is an array, point to the addr itself */ + val = (unsigned long)addr; break; } return val; @@ -797,18 +803,20 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - struct traceprobe_parse_context ctx = { - .event = ep->event, - .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); - traceprobe_finish_parse(&ctx); return ret; } @@ -869,10 +877,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; + char *buf1 __free(kfree) = NULL; + char *buf2 __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; struct trace_eprobe *ep = NULL; - char buf1[MAX_EVENT_NAME_LEN]; - char buf2[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0, filter_idx = 0; int i, filter_cnt; @@ -883,6 +891,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + goto mem_error; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); @@ -892,6 +903,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; + + buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf2) + goto mem_error; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); @@ -899,7 +915,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (!event) { - strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); + buf1 = kstrdup(sys_event, GFP_KERNEL); + if (!buf1) + goto mem_error; event = buf1; } @@ -972,6 +990,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_clear(); return ret; +mem_error: + ret = -ENOMEM; + goto error; parse_error: ret = -EINVAL; error: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 069e92856bda..d01e5c910ce1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca return true; } +static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len, + u64 *dereference_flags, int arg, + struct trace_event_call *call) +{ + if (string_flags & (1ULL << arg)) { + if (process_string(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + else + pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n", + len, arg_str); +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -563,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + e - start_arg, + &dereference_flags, arg, call); } start_arg = i; @@ -578,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + i - start_arg, + &dereference_flags, arg, call); } /* @@ -622,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; - struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; @@ -632,9 +641,11 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) if (!pid_list && !no_pid_list) return false; - data = this_cpu_ptr(tr->array_buffer.data); - - return data->ignore_pid; + /* + * This is recorded at every sched_switch for this task. + * Thus, even if the task migrates the ignore value will be the same. + */ + return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); @@ -3125,7 +3136,10 @@ __register_event(struct trace_event_call *call, struct module *mod) if (ret < 0) return ret; + down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); + up_write(&trace_event_sem); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else @@ -3739,6 +3753,8 @@ __trace_add_event_dirs(struct trace_array *tr) struct trace_event_call *call; int ret; + lockdep_assert_held(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2048560264bb..e4581e10782b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr, static inline struct event_filter *event_filter(struct trace_event_file *file) { - return file->filter; + return rcu_dereference_protected(file->filter, + lockdep_is_held(&event_mutex)); + } /* caller must hold event_mutex */ @@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter) static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); - remove_filter_string(file->filter); + remove_filter_string(event_filter(file)); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -1335,22 +1337,137 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, } } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +struct filter_head { + struct list_head list; + struct rcu_head rcu; +}; + + +static void free_filter_list(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + call_rcu(rhp, free_filter_list); +} + +/* + * The tracepoint_synchronize_unregister() is a double rcu call. + * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu(). + * Instead of waiting for it, simply call these via the call_rcu*() + * variants. + */ +static void delay_free_filter(struct filter_head *head) +{ + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); +} + +static void try_delay_free_filter(struct event_filter *filter) +{ + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); + return; + + free_now: + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); +} + static inline void __free_subsystem_filter(struct trace_event_file *file) { - __free_filter(file->filter); + __free_filter(event_filter(file)); file->filter = NULL; } +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, - struct trace_array *tr) + struct trace_array *tr, + struct event_filter *filter) { struct trace_event_file *file; + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + item->filter = event_filter(file); + list_add_tail(&item->list, &head->list); + event_clear_filter(file); + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + + delay_free_filter(head); + return; + free_now: + tracepoint_synchronize_unregister(); + + if (head) + free_filter_list(&head->rcu); + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir || !file->filter) + continue; __free_subsystem_filter(file); } + __free_filter(filter); } int filter_assign_type(const char *type) @@ -2120,22 +2237,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file) trace_buffered_event_enable(); } -static inline void event_set_filter(struct trace_event_file *file, - struct event_filter *filter) -{ - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct trace_event_file *file) -{ - RCU_INIT_POINTER(file->filter, NULL); -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2144,11 +2245,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; - struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -2175,7 +2281,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!filter_item) goto fail_mem; - list_add_tail(&filter_item->list, &filter_list); + list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2195,31 +2301,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(&filter_list->rcu); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); + else + free_filter_list(&filter_list->rcu); + return -ENOMEM; } @@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); - /* Make sure the filter is not being used */ - tracepoint_synchronize_unregister(); - __free_filter(filter); + try_delay_free_filter(filter); return 0; } @@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_set_filter(file, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); - __free_filter(tmp); - } + if (tmp) + try_delay_free_filter(tmp); } return err; @@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - tracepoint_synchronize_unregister(); - filter_free_subsystem_filters(dir, tr); - __free_filter(filter); + filter_free_subsystem_filters(dir, tr, filter); return 0; } @@ -2810,6 +2900,10 @@ static __init int ftrace_test_event_filter(void) if (i == DATA_CNT) printk(KERN_CONT "OK\n"); + /* Need to call ftrace_test_filter to prevent a warning */ + if (!trace_ftrace_test_filter_enabled()) + trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8); + return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1260c23cfa5f..1d536219b624 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -114,6 +114,7 @@ enum hist_field_fn { HIST_FIELD_FN_BUCKET, HIST_FIELD_FN_TIMESTAMP, HIST_FIELD_FN_CPU, + HIST_FIELD_FN_COMM, HIST_FIELD_FN_STRING, HIST_FIELD_FN_DYNSTRING, HIST_FIELD_FN_RELDYNSTRING, @@ -506,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, HIST_FIELD_FL_GRAPH = 1 << 20, + HIST_FIELD_FL_COMM = 1 << 21, }; struct var_defs { @@ -885,6 +887,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +static u64 hist_field_comm(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return (u64)(unsigned long)current->comm; +} + /** * check_field_for_var_ref - Check if a VAR_REF field references a variable * @hist_field: The VAR_REF field to check @@ -1338,6 +1349,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_COMM) + field_name = "common_comm"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -2015,6 +2028,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_COMM) { + hist_field->fn_num = HIST_FIELD_FN_COMM; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = "char[]"; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -2359,9 +2379,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; - } else if (strcmp(field_name, "common_cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) { *flags |= HIST_FIELD_FL_CPU; - else if (strcmp(field_name, "hitcount") == 0) + } else if (strcmp(field_name, "common_comm") == 0) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; + } else if (strcmp(field_name, "hitcount") == 0) *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); @@ -2377,6 +2399,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_CPU; } else if (field && field->filter_type == FILTER_STACKTRACE) { *flags |= HIST_FIELD_FL_STACKTRACE; + } else if (field && field->filter_type == FILTER_COMM) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -4327,6 +4351,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return hist_field_timestamp(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_CPU: return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COMM: + return hist_field_comm(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_STRING: return hist_field_string(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_DYNSTRING: @@ -5212,22 +5238,25 @@ static inline void add_to_key(char *compound_key, void *key, size_t size = key_field->size; if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_RDYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_STATIC_STRING) - size = field->size; + if (key_field->flags & HIST_FIELD_FL_COMM) { + size = strlen((char *)key); + } else { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + } /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - - strncpy(compound_key + key_field->offset, (char *)key, size); - } else - memcpy(compound_key + key_field->offset, key, size); + } + memcpy(compound_key + key_field->offset, key, size); } static void @@ -5246,17 +5275,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, } } +/* + * The hist_pad structure is used to save information to create + * a histogram from the histogram trigger. It's too big to store + * on the stack, so when the histogram trigger is initialized + * a percpu array of 4 hist_pad structures is allocated. + * This will cover every context from normal, softirq, irq and NMI + * in the very unlikely event that a tigger happens at each of + * these contexts and interrupts a currently active trigger. + */ +struct hist_pad { + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; +}; + +static struct hist_pad __percpu *hist_pads; +static DEFINE_PER_CPU(int, hist_pad_cnt); +static refcount_t hist_pad_ref; + +/* One hist_pad for every context (normal, softirq, irq, NMI) */ +#define MAX_HIST_CNT 4 + +static int alloc_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (refcount_read(&hist_pad_ref)) { + refcount_inc(&hist_pad_ref); + return 0; + } + + hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT, + __alignof__(struct hist_pad)); + if (!hist_pads) + return -ENOMEM; + + refcount_set(&hist_pad_ref, 1); + return 0; +} + +static void free_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (!refcount_dec_and_test(&hist_pad_ref)) + return; + + free_percpu(hist_pads); + hist_pads = NULL; +} + +static struct hist_pad *get_hist_pad(void) +{ + struct hist_pad *hist_pad; + int cnt; + + if (WARN_ON_ONCE(!hist_pads)) + return NULL; + + preempt_disable(); + + hist_pad = per_cpu_ptr(hist_pads, smp_processor_id()); + + if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) { + preempt_enable(); + return NULL; + } + + cnt = this_cpu_inc_return(hist_pad_cnt) - 1; + + return &hist_pad[cnt]; +} + +static void put_hist_pad(void) +{ + this_cpu_dec(hist_pad_cnt); + preempt_enable(); +} + static void event_hist_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); - unsigned long entries[HIST_STACKTRACE_DEPTH]; - u64 var_ref_vals[TRACING_MAP_VARS_MAX]; - char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct hist_field *key_field; + struct hist_pad *hist_pad; u64 field_contents; void *key = NULL; unsigned int i; @@ -5264,12 +5370,18 @@ static void event_hist_trigger(struct event_trigger_data *data, if (unlikely(!rbe)) return; - memset(compound_key, 0, hist_data->key_size); + hist_pad = get_hist_pad(); + if (!hist_pad) + return; + + memset(hist_pad->compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + unsigned long *entries = hist_pad->entries; + memset(entries, 0, HIST_STACKTRACE_SIZE); if (key_field->field) { unsigned long *stack, n_entries; @@ -5293,26 +5405,31 @@ static void event_hist_trigger(struct event_trigger_data *data, } if (use_compound_key) - add_to_key(compound_key, key, key_field, rec); + add_to_key(hist_pad->compound_key, key, key_field, rec); } if (use_compound_key) - key = compound_key; + key = hist_pad->compound_key; if (hist_data->n_var_refs && - !resolve_var_refs(hist_data, key, var_ref_vals, false)) - return; + !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false)) + goto out; elt = tracing_map_insert(hist_data->map, key); if (!elt) - return; + goto out; - hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals); - if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) { + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, + key, hist_pad->var_ref_vals); + } hist_poll_wakeup(); + + out: + put_hist_pad(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -6011,6 +6128,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + if (hist_field->flags & HIST_FIELD_FL_COMM) + seq_puts(m, "common_comm"); else if (hist_field->flags & HIST_FIELD_FL_CONST) seq_printf(m, "%llu", hist_field->constant); else if (field_name) { @@ -6157,6 +6276,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; + if (alloc_hist_pad() < 0) + return -ENOMEM; + if (!data->ref && hist_data->attrs->name) save_named_trigger(hist_data->attrs->name, data); @@ -6201,6 +6323,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data) destroy_hist_data(hist_data); } + free_hist_pad(); } static const struct event_trigger_ops event_hist_trigger_ops = { @@ -6216,9 +6339,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data) save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(data->named_data); - - return 0; + return event_hist_trigger_init(data->named_data); } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6705,7 +6826,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 6e87ae2a1a66..cbfc306c0159 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -552,16 +552,14 @@ static int register_trigger(char *glob, lockdep_assert_held(&event_mutex); list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { - ret = -EEXIST; - goto out; - } + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) + return -EEXIST; } if (data->ops->init) { ret = data->ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -572,7 +570,6 @@ static int register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -770,7 +767,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!param_and_filter) { if (param_required) ret = -EINVAL; - goto out; + return ret; } /* @@ -781,7 +778,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, */ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { *filter = param_and_filter; - goto out; + return ret; } /* @@ -799,12 +796,11 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!**filter) *filter = NULL; } -out: return ret; } /** - * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * trigger_data_alloc - allocate and init event_trigger_data for a trigger * @cmd_ops: The event_command operations for the trigger * @cmd: The cmd string * @param: The param string @@ -815,14 +811,14 @@ out: * trigger_ops to assign to the event_trigger_data. @private_data can * also be passed in and associated with the event_trigger_data. * - * Use event_trigger_free() to free an event_trigger_data object. + * Use trigger_data_free() to free an event_trigger_data object. * * Return: The trigger_data object success, NULL otherwise */ -struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data) +struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) { struct event_trigger_data *trigger_data; const struct event_trigger_ops *trigger_ops; @@ -989,15 +985,14 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; ret = -ENOMEM; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) - goto out; + return ret; if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); - kfree(trigger_data); - ret = 0; - goto out; + trigger_data_free(trigger_data); + return 0; } ret = event_trigger_parse_num(param, trigger_data); @@ -1017,13 +1012,12 @@ event_trigger_parse(struct event_command *cmd_ops, /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_data); - out: return ret; out_free: event_trigger_reset_filter(cmd_ops, trigger_data); - kfree(trigger_data); - goto out; + trigger_data_free(trigger_data); + return ret; } /** @@ -1057,10 +1051,10 @@ int set_trigger_filter(char *filter_str, s = strsep(&filter_str, " \t"); if (!strlen(s) || strcmp(s, "if") != 0) - goto out; + return ret; if (!filter_str) - goto out; + return ret; /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, @@ -1104,7 +1098,6 @@ int set_trigger_filter(char *filter_str, ret = -ENOMEM; } } - out: return ret; } @@ -1772,7 +1765,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); if (!event_enable_file) - goto out; + return ret; #ifdef CONFIG_HIST_TRIGGERS hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || @@ -1787,16 +1780,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); if (!enable_data) - goto out; + return ret; enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { kfree(enable_data); - goto out; + return ret; } if (remove) { @@ -1804,7 +1797,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, kfree(trigger_data); kfree(enable_data); ret = 0; - goto out; + return ret; } /* Up the trigger_data count to make sure nothing frees it on failure */ @@ -1834,7 +1827,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, goto out_disable; event_trigger_free(trigger_data); - out: return ret; out_disable: trace_event_enable_disable(event_enable_file, 0, 1); @@ -1845,7 +1837,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); kfree(enable_data); - goto out; + return ret; } int event_enable_register_trigger(char *glob, @@ -1865,15 +1857,14 @@ int event_enable_register_trigger(char *glob, (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { - ret = -EEXIST; - goto out; + return -EEXIST; } } if (data->ops->init) { ret = data->ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -1884,7 +1875,6 @@ int event_enable_register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b40fa59159ac..b36ade43d4b3 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,15 +4,18 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt -#include <asm/ptrace.h> #include <linux/fprobe.h> +#include <linux/list.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/rculist.h> #include <linux/security.h> #include <linux/tracepoint.h> #include <linux/uaccess.h> +#include <asm/ptrace.h> + #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_kernel.h" @@ -21,7 +24,6 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 -#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -38,15 +40,156 @@ static struct dyn_event_operations trace_fprobe_ops = { .match = trace_fprobe_match, }; +/* List of tracepoint_user */ +static LIST_HEAD(tracepoint_user_list); +static DEFINE_MUTEX(tracepoint_user_mutex); + +/* While living tracepoint_user, @tpoint can be NULL and @refcount != 0. */ +struct tracepoint_user { + struct list_head list; + const char *name; + struct tracepoint *tpoint; + unsigned int refcount; +}; + +/* NOTE: you must lock tracepoint_user_mutex. */ +#define for_each_tracepoint_user(tuser) \ + list_for_each_entry(tuser, &tracepoint_user_list, list) + +static int tracepoint_user_register(struct tracepoint_user *tuser) +{ + struct tracepoint *tpoint = tuser->tpoint; + + if (!tpoint) + return 0; + + return tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); +} + +static void tracepoint_user_unregister(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return; + + WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL)); + tuser->tpoint = NULL; +} + +static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return 0UL; + + return (unsigned long)tuser->tpoint->probestub; +} + +static void __tracepoint_user_free(struct tracepoint_user *tuser) +{ + if (!tuser) + return; + kfree(tuser->name); + kfree(tuser); +} + +DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T)) + +static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint) +{ + struct tracepoint_user *tuser __free(tuser_free) = NULL; + int ret; + + tuser = kzalloc(sizeof(*tuser), GFP_KERNEL); + if (!tuser) + return NULL; + tuser->name = kstrdup(name, GFP_KERNEL); + if (!tuser->name) + return NULL; + + if (tpoint) { + ret = tracepoint_user_register(tuser); + if (ret) + return ERR_PTR(ret); + } + + tuser->tpoint = tpoint; + tuser->refcount = 1; + INIT_LIST_HEAD(&tuser->list); + list_add(&tuser->list, &tracepoint_user_list); + + return_ptr(tuser); +} + +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod); + +/* + * Get tracepoint_user if exist, or allocate new one and register it. + * If tracepoint is on a module, get its refcounter too. + * This returns errno or NULL (not loaded yet) or tracepoint_user. + */ +static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod) +{ + struct module *mod __free(module_put) = NULL; + struct tracepoint_user *tuser; + struct tracepoint *tpoint; + + if (!name || !pmod) + return ERR_PTR(-EINVAL); + + /* Get and lock the module which has tracepoint. */ + tpoint = find_tracepoint(name, &mod); + + guard(mutex)(&tracepoint_user_mutex); + /* Search existing tracepoint_user */ + for_each_tracepoint_user(tuser) { + if (!strcmp(tuser->name, name)) { + tuser->refcount++; + *pmod = no_free_ptr(mod); + return tuser; + } + } + + /* The corresponding tracepoint_user is not found. */ + tuser = __tracepoint_user_init(name, tpoint); + if (!IS_ERR_OR_NULL(tuser)) + *pmod = no_free_ptr(mod); + + return tuser; +} + +static void tracepoint_user_put(struct tracepoint_user *tuser) +{ + scoped_guard(mutex, &tracepoint_user_mutex) { + if (--tuser->refcount > 0) + return; + + list_del(&tuser->list); + tracepoint_user_unregister(tuser); + } + + __tracepoint_user_free(tuser); +} + +DEFINE_FREE(tuser_put, struct tracepoint_user *, + if (!IS_ERR_OR_NULL(_T)) + tracepoint_user_put(_T)) + /* * Fprobe event core functions */ + +/* + * @tprobe is true for tracepoint probe. + * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not + * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled. + */ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; - struct tracepoint *tpoint; - struct module *mod; + bool tprobe; + struct tracepoint_user *tuser; struct trace_probe tp; }; @@ -76,7 +219,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) { - return tf->tpoint != NULL; + return tf->tprobe; } static const char *trace_fprobe_symbol(struct trace_fprobe *tf) @@ -411,6 +554,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf) { if (tf) { trace_probe_cleanup(&tf->tp); + if (tf->tuser) + tracepoint_user_put(tf->tuser); kfree(tf->symbol); kfree(tf); } @@ -425,9 +570,8 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, - struct tracepoint *tpoint, - struct module *mod, - int nargs, bool is_return) + int nargs, bool is_return, + bool is_tracepoint) { struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; @@ -445,8 +589,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, else tf->fp.entry_handler = fentry_dispatcher; - tf->tpoint = tpoint; - tf->mod = mod; + tf->tprobe = is_tracepoint; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) @@ -469,98 +612,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event, return NULL; } -static inline int __enable_trace_fprobe(struct trace_fprobe *tf) -{ - if (trace_fprobe_is_registered(tf)) - enable_fprobe(&tf->fp); - - return 0; -} - -static void __disable_trace_fprobe(struct trace_probe *tp) -{ - struct trace_fprobe *tf; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - if (!trace_fprobe_is_registered(tf)) - continue; - disable_fprobe(&tf->fp); - } -} - -/* - * Enable trace_probe - * if the file is NULL, enable "perf" handler, or enable "trace" handler. - */ -static int enable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - struct trace_fprobe *tf; - bool enabled; - int ret = 0; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - enabled = trace_probe_is_enabled(tp); - - /* This also changes "enabled" state */ - if (file) { - ret = trace_probe_add_file(tp, file); - if (ret) - return ret; - } else - trace_probe_set_flag(tp, TP_FLAG_PROFILE); - - if (!enabled) { - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - /* TODO: check the fprobe is gone */ - __enable_trace_fprobe(tf); - } - } - - return 0; -} - -/* - * Disable trace_probe - * if the file is NULL, disable "perf" handler, or disable "trace" handler. - */ -static int disable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - - if (file) { - if (!trace_probe_get_file_link(tp, file)) - return -ENOENT; - if (!trace_probe_has_single_file(tp)) - goto out; - trace_probe_clear_flag(tp, TP_FLAG_TRACE); - } else - trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - - if (!trace_probe_is_enabled(tp)) - __disable_trace_fprobe(tp); - - out: - if (file) - /* - * Synchronization is done in below function. For perf event, - * file == NULL and perf_trace_event_unreg() calls - * tracepoint_synchronize_unregister() to ensure synchronize - * event. We don't need to care about it. - */ - trace_probe_remove_file(tp, file); - - return 0; -} - /* Event entry printers */ static enum print_line_t print_fentry_event(struct trace_iterator *iter, int flags, @@ -712,20 +763,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; + struct tracepoint_user *tuser __free(tuser_put) = NULL; + struct module *mod __free(module_put) = NULL; + unsigned long ip; int ret; + if (WARN_ON_ONCE(tf->tuser)) + return -EINVAL; + + /* If the tracepoint is in a module, it must be locked in this function. */ + tuser = tracepoint_user_find_get(tf->symbol, &mod); + /* This tracepoint is not loaded yet */ + if (IS_ERR(tuser)) + return PTR_ERR(tuser); + if (!tuser) + return -ENOMEM; + + /* Register fprobe only if the tracepoint is loaded. */ + if (tuser->tpoint) { + ip = tracepoint_user_ip(tuser); + if (WARN_ON_ONCE(!ip)) + return -ENOENT; + + ret = register_fprobe_ips(&tf->fp, &ip, 1); + if (ret < 0) + return ret; + } + + tf->tuser = no_free_ptr(tuser); + return 0; +} + +/* Returns an error if the target function is not available, or 0 */ +static int trace_fprobe_verify_target(struct trace_fprobe *tf) +{ + int ret; + + /* Tracepoint should have a stub function. */ + if (trace_fprobe_is_tracepoint(tf)) + return 0; + /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. + * Note: since we don't lock the module, even if this succeeded, + * register_fprobe() later can fail. */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; - return register_fprobe_ips(&tf->fp, &ip, 1); + ret = fprobe_count_ips_from_filter(tf->symbol, NULL); + return (ret < 0) ? ret : 0; } /* Internal register function - just handle fprobe and flags */ @@ -747,20 +830,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(&tf->tp)) - tf->fp.flags &= ~FPROBE_FL_DISABLED; - else - tf->fp.flags |= FPROBE_FL_DISABLED; - - if (trace_fprobe_is_tracepoint(tf)) { - - /* This tracepoint is not loaded yet */ - if (tf->tpoint == TRACEPOINT_STUB) - return 0; + tf->fp.flags &= ~FPROBE_FL_DISABLED; + if (trace_fprobe_is_tracepoint(tf)) return __regsiter_tracepoint_fprobe(tf); - } /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); @@ -769,15 +842,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) /* Internal unregister function - just handle fprobe and flags */ static void __unregister_trace_fprobe(struct trace_fprobe *tf) { - if (trace_fprobe_is_registered(tf)) { + if (trace_fprobe_is_registered(tf)) unregister_fprobe(&tf->fp); - memset(&tf->fp, 0, sizeof(tf->fp)); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; - } + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; } } @@ -837,7 +906,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, return false; } -static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; @@ -865,7 +934,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) if (ret) return ret; - ret = __register_trace_fprobe(tf); + ret = trace_fprobe_verify_target(tf); if (ret) trace_probe_unlink(&tf->tp); else @@ -874,8 +943,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) return ret; } -/* Register a trace_probe and probe_event */ -static int register_trace_fprobe(struct trace_fprobe *tf) +/* Register a trace_probe and probe_event, and check the fprobe is available. */ +static int register_trace_fprobe_event(struct trace_fprobe *tf) { struct trace_fprobe *old_tf; int ret; @@ -885,7 +954,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); if (old_tf) - return append_trace_fprobe(tf, old_tf); + return append_trace_fprobe_event(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -898,8 +967,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Register fprobe */ - ret = __register_trace_fprobe(tf); + /* Verify fprobe is sane. */ + ret = trace_fprobe_verify_target(tf); if (ret < 0) unregister_fprobe_event(tf); else @@ -963,15 +1032,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name, } #ifdef CONFIG_MODULES -static void reenable_trace_fprobe(struct trace_fprobe *tf) -{ - struct trace_probe *tp = &tf->tp; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - __enable_trace_fprobe(tf); - } -} - /* * Find a tracepoint from specified module. In this case, this does not get the * module's refcount. The caller must ensure the module is not freed. @@ -988,36 +1048,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod, return data.tpoint; } +/* These are CONFIG_MODULES=y specific functions. */ +static bool tracepoint_user_within_module(struct tracepoint_user *tuser, + struct module *mod) +{ + return within_module(tracepoint_user_ip(tuser), mod); +} + +static int tracepoint_user_register_again(struct tracepoint_user *tuser, + struct tracepoint *tpoint) +{ + tuser->tpoint = tpoint; + return tracepoint_user_register(tuser); +} + +static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser) +{ + tracepoint_user_unregister(tuser); + tuser->tpoint = NULL; +} + +/* module callback for tracepoint_user */ static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; + struct tracepoint_user *tuser; struct tracepoint *tpoint; + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + mutex_lock(&tracepoint_user_mutex); + for_each_tracepoint_user(tuser) { + if (val == MODULE_STATE_COMING) { + /* This is not a tracepoint in this module. Skip it. */ + tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name); + if (!tpoint) + continue; + WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint)); + } else if (val == MODULE_STATE_GOING && + tracepoint_user_within_module(tuser, tp_mod->mod)) { + /* Unregister all tracepoint_user in this module. */ + tracepoint_user_unregister_clear(tuser); + } + } + mutex_unlock(&tracepoint_user_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; + +/* module callback for tprobe events */ +static int __tprobe_event_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ struct trace_fprobe *tf; struct dyn_event *pos; + struct module *mod = data; if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { - if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { - tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); - if (tpoint) { - tf->tpoint = tpoint; - tf->mod = tp_mod->mod; - if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && - trace_probe_is_enabled(&tf->tp)) - reenable_trace_fprobe(tf); - } - } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { + /* Skip fprobe and disabled tprobe events. */ + if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser) + continue; + + /* Before this notification, tracepoint notifier has already done. */ + if (val == MODULE_STATE_COMING && + tracepoint_user_within_module(tf->tuser, mod)) { + unsigned long ip = tracepoint_user_ip(tf->tuser); + + WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1)); + } else if (val == MODULE_STATE_GOING && + /* + * tracepoint_user_within_module() does not work here because + * tracepoint_user is already unregistered and cleared tpoint. + * Instead, checking whether the fprobe is registered but + * tpoint is cleared(unregistered). Such unbalance probes + * must be adjusted anyway. + */ + trace_fprobe_is_registered(tf) && + !tf->tuser->tpoint) { unregister_fprobe(&tf->fp); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = TRACEPOINT_STUB; - tf->mod = NULL; - } } } mutex_unlock(&event_mutex); @@ -1025,8 +1143,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, return NOTIFY_DONE; } -static struct notifier_block tracepoint_module_nb = { - .notifier_call = __tracepoint_probe_module_cb, +/* NOTE: this must be called after tracepoint callback */ +static struct notifier_block tprobe_event_module_nb = { + .notifier_call = __tprobe_event_module_cb, + /* Make sure this is later than tracepoint module notifier. */ + .priority = -10, }; #endif /* CONFIG_MODULES */ @@ -1086,8 +1207,6 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) - static int trace_fprobe_create_internal(int argc, const char *argv[], struct traceprobe_parse_context *ctx) { @@ -1116,19 +1235,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; - int i, new_argc = 0, ret = 0; - bool is_return = false; - char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + struct module *mod __free(module_put) = NULL; const char **new_argv __free(kfree) = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char sbuf[KSYM_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; + char *symbol __free(kfree) = NULL; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *sbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; char *dbuf __free(kfree) = NULL; + int i, new_argc = 0, ret = 0; bool is_tracepoint = false; - struct module *tp_mod __free(module_put) = NULL; - struct tracepoint *tpoint = NULL; + bool is_return = false; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1156,6 +1274,9 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -1163,15 +1284,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], } if (!event) { + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; /* Make a new event name */ if (is_tracepoint) - snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s", isdigit(*symbol) ? "_" : "", symbol); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, is_return ? "exit" : "entry"); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } if (is_return) @@ -1179,25 +1303,28 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], else ctx->flags |= TPARG_FL_FENTRY; + ctx->funcname = NULL; if (is_tracepoint) { + /* Get tracepoint and lock its module until the end of the registration. */ + struct tracepoint *tpoint; + ctx->flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol, &tp_mod); + mod = NULL; + tpoint = find_tracepoint(symbol, &mod); if (tpoint) { - ctx->funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); - } else if (IS_ENABLED(CONFIG_MODULES)) { - /* This *may* be loaded afterwards */ - tpoint = TRACEPOINT_STUB; - ctx->funcname = symbol; - } else { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - return -EINVAL; + sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!sbuf) + return -ENOMEM; + ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); } - } else + } + if (!ctx->funcname) ctx->funcname = symbol; + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, abuf, MAX_BTF_ARGS_LEN, ctx); @@ -1218,8 +1345,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ @@ -1251,7 +1377,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (ret < 0) return ret; - ret = register_trace_fprobe(tf); + ret = register_trace_fprobe_event(tf); if (ret) { trace_probe_log_set_index(1); if (ret == -EILSEQ) @@ -1271,14 +1397,17 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], static int trace_fprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; + trace_probe_log_init("trace_fprobe", argc, argv); - ret = trace_fprobe_create_internal(argc, argv, &ctx); - traceprobe_finish_parse(&ctx); + ret = trace_fprobe_create_internal(argc, argv, ctx); trace_probe_log_clear(); return ret; } @@ -1321,6 +1450,84 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) } /* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + ret = __register_trace_fprobe(tf); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_fprobe *tf; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + unregister_fprobe(&tf->fp); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. */ static int fprobe_register(struct trace_event_call *event, @@ -1365,6 +1572,9 @@ static __init int init_fprobe_trace_early(void) ret = register_tracepoint_module_notifier(&tracepoint_module_nb); if (ret) return ret; + ret = register_module_notifier(&tprobe_event_module_nb); + if (ret) + return ret; #endif return 0; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 4e37a0f6aaa3..d17c18934445 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -209,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -224,9 +223,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_dec(); - data = this_cpu_ptr(tr->array_buffer.data); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx, NULL); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); ftrace_test_recursion_unlock(bit); } @@ -236,10 +233,8 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; - int cpu; if (unlikely(!tr->function_enabled)) return; @@ -250,10 +245,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx(); - cpu = smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx, fregs); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -299,7 +291,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); @@ -311,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, __trace_stack(tr, trace_ctx, skip); } - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -352,7 +344,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_func_repeats *last_info; struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -364,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, return; parent_ip = function_get_true_parent_ip(parent_ip, fregs); - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on(tr)) goto out; /* @@ -412,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { last_info = per_cpu_ptr(tr->last_func_repeats, cpu); @@ -427,7 +417,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, } out: - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0c357a89c58e..66e1a527cf1a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -202,12 +202,9 @@ static int graph_entry(struct ftrace_graph_ent *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; - long disabled; int ret = 0; - int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) return 0; @@ -257,21 +254,14 @@ static int graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { - unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else { - ret = __graph_entry(tr, trace, trace_ctx, fregs); - } + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); + } else { + ret = __graph_entry(tr, trace, trace_ctx, fregs); } - preempt_enable_notrace(); return ret; } @@ -351,13 +341,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; u64 calltime, rettime; - long disabled; int size; - int cpu; rettime = trace_clock_local(); @@ -376,15 +363,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace, calltime = ftimes->calltime; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); - } - preempt_enable_notrace(); + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, @@ -475,10 +455,16 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; + if (set) entry = trace_graph_entry_args; else @@ -527,7 +513,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ - char pid_str[11]; + char pid_str[12]; int spaces = 0; int len; int i; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 40c39e946940..5496758b6c76 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (likely(disabled == 1)) return 1; - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); return 0; } @@ -152,7 +152,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx, fregs); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } #endif /* CONFIG_FUNCTION_TRACER */ @@ -209,7 +209,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, trace_ctx = tracing_gen_ctx_flags(flags); ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + local_dec(&data->disabled); return ret; } @@ -238,7 +238,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, trace_ctx = tracing_gen_ctx_flags(flags); __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static struct fgraph_ops fgraph_ops = { @@ -397,6 +397,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; + long disabled; if (!tracer_enabled || !tracing_is_enabled()) return; @@ -408,20 +409,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (unlikely(!data) || atomic_read(&data->disabled)) + if (unlikely(!data) || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? : ip; + if (disabled == 1) { + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; - __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); - per_cpu(tracing_cpu, cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; + } - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static nokprobe_inline void @@ -431,6 +434,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + long disabled; cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -445,16 +449,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || - !data->critical_start || atomic_read(&data->disabled)) + !data->critical_start || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - trace_ctx = tracing_gen_ctx(); - __trace_function(tr, ip, parent_ip, trace_ctx); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; - atomic_dec(&data->disabled); + if (disabled == 1) { + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + } + + local_dec(&data->disabled); } /* start and stop critical timings used to for stoppage (in idle) */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 1e72d20b3c2f..896ff78b8349 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.array_buffer->buffer, - cpu, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu]); + ring_buffer_read_start(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.array_buffer->buffer, + ring_buffer_read_start(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -98,7 +96,6 @@ static int kdb_ftdump(int argc, const char **argv) long cpu_file; int err; int cnt; - int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -120,9 +117,7 @@ static int kdb_ftdump(int argc, const char **argv) trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_disable(iter.tr); /* A negative skip_entries means skip all but the last entries */ if (skip_entries < 0) { @@ -135,9 +130,7 @@ static int kdb_ftdump(int argc, const char **argv) ftrace_dump_buf(skip_entries, cpu_file); - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(iter.tr); kdb_trap_printk--; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3e5c47b6d7b2..ccae62d4fb91 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -9,19 +9,19 @@ #include <linux/bpf-cgroup.h> #include <linux/cleanup.h> -#include <linux/security.h> +#include <linux/error-injection.h> #include <linux/module.h> -#include <linux/uaccess.h> #include <linux/rculist.h> -#include <linux/error-injection.h> +#include <linux/security.h> +#include <linux/uaccess.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -861,20 +861,20 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + const char **new_argv __free(kfree) = NULL; int i, len, new_argc = 0, ret = 0; - bool is_return = false; char *symbol __free(kfree) = NULL; - char *tmp = NULL; - const char **new_argv __free(kfree) = NULL; - const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; + char *dbuf __free(kfree) = NULL; enum probe_print_type ptype; + bool is_return = false; int maxactive = 0; - long offset = 0; void *addr = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf __free(kfree) = NULL; + char *tmp = NULL; + long offset = 0; switch (argv[0][0]) { case 'r': @@ -893,6 +893,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], event++; if (isdigit(argv[0][1])) { + char *buf __free(kfree) = NULL; + if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); return -EINVAL; @@ -905,7 +907,7 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_err(1, BAD_MAXACT); return -EINVAL; } - memcpy(buf, &argv[0][1], len); + buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { @@ -973,6 +975,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -981,16 +986,22 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], if (!event) { /* Make a new event name */ + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, @@ -1065,14 +1076,18 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], static int trace_kprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = TPARG_FL_KERNEL; + trace_probe_log_init("trace_kprobe", argc, argv); - ret = trace_kprobe_create_internal(argc, argv, &ctx); + ret = trace_kprobe_create_internal(argc, argv, ctx); - traceprobe_finish_parse(&ctx); trace_probe_log_clear(); return ret; } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ba5858866b2f..c706544be60c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -291,7 +291,6 @@ __init static int init_mmio_trace(void) device_initcall(init_mmio_trace); static void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_rw *rw) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_rw(tr, data, rw); + __trace_mmiotrace_rw(tr, rw); } static void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_map *map) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); + __trace_mmiotrace_map(tr, map); } int mmio_trace_printk(const char *fmt, va_list args) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index e732c9e37e14..fd259da0aa64 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -637,8 +637,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); trace_buffer_unlock_commit_nostack(buffer, event); } @@ -2302,7 +2302,7 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" + * @count: The maximum number of bytes to write to "file" * @ppos: The current position in @file * * This function provides a write implementation for the "cpus" @@ -2320,10 +2320,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, { cpumask_var_t osnoise_cpumask_new; int running, err; - char buf[256]; + char *buf __free(kfree) = NULL; - if (count >= 256) - return -EINVAL; + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b9ab06c99543..0b3db02030a7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -938,6 +938,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c struct list_head *head) { struct ftrace_event_field *field; + struct trace_array *tr = iter->tr; + unsigned long long laddr; + unsigned long addr; int offset; int len; int ret; @@ -974,8 +977,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c case FILTER_PTR_STRING: if (!iter->fmt_size) trace_iter_expand_format(iter); - pos = *(void **)pos; - ret = strncpy_from_kernel_nofault(iter->fmt, pos, + addr = trace_adjust_address(tr, *(unsigned long *)pos); + ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr, iter->fmt_size); if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", pos); @@ -984,8 +987,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c pos, iter->fmt); break; case FILTER_TRACE_FN: - pos = *(void **)pos; - trace_seq_printf(&iter->seq, "%pS", pos); + addr = trace_adjust_address(tr, *(unsigned long *)pos); + trace_seq_printf(&iter->seq, "%pS", (void *)addr); break; case FILTER_CPU: case FILTER_OTHER: @@ -1015,14 +1018,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c break; } - trace_seq_printf(&iter->seq, "0x%x (%d)", - *(unsigned int *)pos, - *(unsigned int *)pos); + addr = *(unsigned int *)pos; + + /* Some fields reference offset from _stext. */ + if (!strcmp(field->name, "caller_offs") || + !strcmp(field->name, "parent_offs")) { + unsigned long ip; + + ip = addr + (unsigned long)_stext; + ip = trace_adjust_address(tr, ip); + trace_seq_printf(&iter->seq, "%pS ", (void *)ip); + } + + if (sizeof(long) == 4) { + addr = trace_adjust_address(tr, addr); + trace_seq_printf(&iter->seq, "%pS (%d)", + (void *)addr, (int)addr); + } else { + trace_seq_printf(&iter->seq, "0x%x (%d)", + (unsigned int)addr, (int)addr); + } break; case 8: - trace_seq_printf(&iter->seq, "0x%llx (%lld)", - *(unsigned long long *)pos, - *(unsigned long long *)pos); + laddr = *(unsigned long long *)pos; + if (sizeof(long) == 8) { + laddr = trace_adjust_address(tr, (unsigned long)laddr); + trace_seq_printf(&iter->seq, "%pS (%lld)", + (void *)(long)laddr, laddr); + } else { + trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr); + } break; default: trace_seq_puts(&iter->seq, "<INVALID-SIZE>"); @@ -1086,11 +1111,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, - unsigned long *args, int flags) + unsigned long parent_ip, unsigned long *args, + struct trace_array *tr, int flags) { - ip += delta; - parent_ip += delta; + ip = trace_adjust_address(tr, ip); + parent_ip = trace_adjust_address(tr, parent_ip); seq_print_ip_sym(s, ip, flags); if (args) @@ -1119,8 +1144,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, else args = NULL; - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, - args, flags); + print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1706,7 +1730,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - ip = field->ip + iter->tr->text_delta; + ip = trace_adjust_address(iter->tr, field->ip); seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); @@ -1792,7 +1816,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags); + print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 424751cdf31f..5cbdc423afeb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,8 +13,8 @@ #include <linux/bpf.h> #include <linux/fs.h> -#include "trace_btf.h" +#include "trace_btf.h" #include "trace_probe.h" #undef C @@ -247,7 +247,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } -/* @buf must has MAX_EVENT_NAME_LEN size */ +/** + * traceprobe_parse_event_name() - Parse a string into group and event names + * @pevent: A pointer to the string to be parsed. + * @pgroup: A pointer to the group name. + * @buf: A buffer to store the parsed group name. + * @offset: The offset of the string in the original user command, for logging. + * + * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]` + * (either GROUP or EVENT or both must be specified). + * Since the parsed group name is stored in @buf, the caller must ensure @buf + * is at least MAX_EVENT_NAME_LEN bytes. + * + * Return: 0 on success, or -EINVAL on failure. + * + * If success, *@pevent is updated to point to the event name part of the + * original string, or NULL if there is no event name. + * Also, *@pgroup is updated to point to the parsed group which is stored + * in @buf, or NULL if there is no group name. + */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset) { @@ -657,7 +675,7 @@ static int parse_btf_arg(char *varname, ret = query_btf_context(ctx); if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); - return PTR_ERR(params); + return -ENOENT; } } params = ctx->params; @@ -779,6 +797,36 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset) +{ + code[0].op = FETCH_OP_ARG; + code[0].param = argnum; + code[1].op = FETCH_OP_ST_EDATA; + code[1].offset = offset; +} + +static int get_entry_arg_max_offset(struct probe_entry_arg *earg) +{ + int i, max_offset = 0; + + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. + */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) { + if (earg->code[i].op == FETCH_OP_ST_EDATA) + if (earg->code[i].offset > max_offset) + max_offset = earg->code[i].offset; + } + return max_offset; +} + /* * Add the entry code to store the 'argnum'th parameter and return the offset * in the entry data buffer where the data will be stored. @@ -786,8 +834,7 @@ static int check_prepare_btf_string_fetch(char *typename, static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; - bool match = false; - int i, offset; + int i, offset, last_offset = 0; if (!earg) { earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); @@ -804,78 +851,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; tp->entry_arg = earg; + store_entry_arg_at(earg->code, argnum, 0); + return 0; } /* - * The entry code array is repeating the pair of - * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] - * and the rest of entries are filled with [FETCH_OP_END]. + * NOTE: if anyone change the following rule, please rewrite this. + * The entry code array is filled with the pair of * - * To reduce the redundant function parameter fetching, we scan the entry - * code array to find the FETCH_OP_ARG which already fetches the 'argnum' - * parameter. If it doesn't match, update 'offset' to find the last - * offset. - * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we - * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and - * return data offset so that caller can find the data offset in the entry - * data buffer. + * [FETCH_OP_ARG(argnum)] + * [FETCH_OP_ST_EDATA(offset of entry data buffer)] + * + * and the rest of entries are filled with [FETCH_OP_END]. + * The offset should be incremented, thus the last pair should + * have the largest offset. */ - offset = 0; - for (i = 0; i < earg->size - 1; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - earg->code[i].op = FETCH_OP_ARG; - earg->code[i].param = argnum; - earg->code[i + 1].op = FETCH_OP_ST_EDATA; - earg->code[i + 1].offset = offset; - return offset; - case FETCH_OP_ARG: - match = (earg->code[i].param == argnum); - break; - case FETCH_OP_ST_EDATA: - offset = earg->code[i].offset; - if (match) - return offset; - offset += sizeof(unsigned long); - break; - default: - break; - } + + /* Search the offset for the sprcified argnum. */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) { + if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG)) + return -EINVAL; + + if (earg->code[i].param != argnum) + continue; + + if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + + return earg->code[i + 1].offset; + } + /* Not found, append new entry if possible. */ + if (i >= earg->size - 1) + return -ENOSPC; + + /* The last entry must have the largest offset. */ + if (i != 0) { + if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + last_offset = earg->code[i - 1].offset; } - return -ENOSPC; + + offset = last_offset + sizeof(unsigned long); + store_entry_arg_at(&earg->code[i], argnum, offset); + return offset; } int traceprobe_get_entry_data_size(struct trace_probe *tp) { struct probe_entry_arg *earg = tp->entry_arg; - int i, size = 0; if (!earg) return 0; - /* - * earg->code[] array has an operation sequence which is run in - * the entry handler. - * The sequence stopped by FETCH_OP_END and each data stored in - * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA - * stores the data at the data buffer + its offset, and all data are - * "unsigned long" size. The offset must be increased when a data is - * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the - * code array. - */ - for (i = 0; i < earg->size; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - goto out; - case FETCH_OP_ST_EDATA: - size = earg->code[i].offset + sizeof(unsigned long); - break; - default: - break; - } - } -out: - return size; + return get_entry_arg_max_offset(earg) + sizeof(unsigned long); } void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 854e5668f5ee..842383fbc03b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -10,20 +10,22 @@ * Author: Srikar Dronamraju */ +#include <linux/bitops.h> +#include <linux/btf.h> +#include <linux/cleanup.h> +#include <linux/kprobes.h> +#include <linux/limits.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/smp.h> -#include <linux/tracefs.h> -#include <linux/types.h> #include <linux/string.h> -#include <linux/ptrace.h> -#include <linux/perf_event.h> -#include <linux/kprobes.h> #include <linux/stringify.h> -#include <linux/limits.h> +#include <linux/tracefs.h> +#include <linux/types.h> #include <linux/uaccess.h> -#include <linux/bitops.h> -#include <linux/btf.h> + #include <asm/bitsperlong.h> #include "trace.h" @@ -438,6 +440,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); * this MUST be called for clean up the context and return a resource. */ void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); +static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx) +{ + traceprobe_finish_parse(ctx); + kfree(ctx); +} + +DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *, + if (_T) traceprobe_free_parse_ctx(_T)) extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a0db3404f7f7..bf1cb80742ae 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr, goto out_enable; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; return 1; out: - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); out_enable: preempt_enable_notrace(); @@ -144,7 +144,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, *calltime = trace_clock_local(); ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); return ret; @@ -173,7 +173,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, return; __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); return; @@ -243,7 +243,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); } @@ -471,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -508,7 +508,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -563,7 +563,7 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -610,7 +610,7 @@ probe_wakeup(void *ignore, struct task_struct *p) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 14c6f272c4d8..0aa2514a6593 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock = DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); -int stack_tracer_enabled; +static int stack_tracer_enabled; static void print_max_stack(void) { @@ -542,7 +542,7 @@ static __init int enable_stacktrace(char *str) int len; if ((len = str_has_prefix(str, "_filter="))) - strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); + strscpy(stack_trace_filter_buf, str + len); stack_tracer_enabled = 1; return 1; @@ -578,3 +578,23 @@ static __init int stack_trace_init(void) } device_initcall(stack_trace_init); + + +static const struct ctl_table trace_stack_sysctl_table[] = { + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +}; + +static int __init init_trace_stack_sysctls(void) +{ + register_sysctl_init("kernel", trace_stack_sysctl_table); + return 0; +} +subsys_initcall(init_trace_stack_sysctls); + + diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 35cf76c75dd7..8b0bcc0d8f41 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -8,17 +8,19 @@ #define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/bpf-cgroup.h> -#include <linux/security.h> +#include <linux/cleanup.h> #include <linux/ctype.h> +#include <linux/filter.h> #include <linux/module.h> -#include <linux/uaccess.h> -#include <linux/uprobes.h> #include <linux/namei.h> -#include <linux/string.h> -#include <linux/rculist.h> -#include <linux/filter.h> #include <linux/percpu.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include "trace.h" #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -537,15 +539,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu) */ static int __trace_uprobe_create(int argc, const char **argv) { - struct trace_uprobe *tu; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - enum probe_print_type ptype; - struct path path; unsigned long offset, ref_ctr_offset; + char *gbuf __free(kfree) = NULL; + char *buf __free(kfree) = NULL; + enum probe_print_type ptype; + struct trace_uprobe *tu; bool is_return = false; + struct path path; int i, ret; ref_ctr_offset = 0; @@ -653,6 +655,10 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + goto fail_mem; + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -664,15 +670,16 @@ static int __trace_uprobe_create(int argc, const char **argv) char *ptr; tail = kstrdup(kbasename(filename), GFP_KERNEL); - if (!tail) { - ret = -ENOMEM; - goto fail_address_parse; - } + if (!tail) + goto fail_mem; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; + buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf) + goto fail_mem; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ -695,13 +702,16 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context ctx = { - .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) + = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + ret = -ENOMEM; + goto error; + } + ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); - traceprobe_finish_parse(&ctx); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) goto error; } @@ -721,6 +731,9 @@ out: trace_probe_log_clear(); return ret; +fail_mem: + ret = -ENOMEM; + fail_address_parse: trace_probe_log_clear(); path_put(&path); @@ -1489,7 +1502,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; - *probe_addr = 0; + *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ |